#!/usr/bin/python
"""Emit "character<TAB>replacement" records for accented Latin letters.

Reads semicolon-separated lines from standard input whose first field is a
hexadecimal code point and whose second field is a character name
(presumably the Unicode Character Database's UnicodeData.txt -- the script
itself only relies on the leading ``CODE;NAME`` text).  For every name of
the form ``LATIN SMALL|CAPITAL LETTER <X> WITH ...`` it writes the character
itself, a tab, and the plain letter ``X`` (lower-cased for SMALL letters).
A fixed list of hand-picked special cases is appended after the input has
been consumed.

Output is always UTF-8 encoded.  The script runs unchanged on Python 2 and
Python 3.
"""

import re
import sys

try:
    _chr = unichr  # Python 2: unichr() builds a unicode character
except NameError:
    _chr = chr  # Python 3: unichr() is gone, chr() returns text

# Group 1: hex code point, group 2: SMALL/CAPITAL, group 3: the base letter.
# The trailing " WITH " restricts matches to letters that carry a modifier.
_ACCENTED_LATIN = re.compile(
    r"^([0-9A-F]+);LATIN (SMALL|CAPITAL) LETTER ([A-Z]) WITH ")

# Characters the pattern above does not cover, with their replacements.
# The trailing remarks are open questions carried over from the original.
_SPECIAL_CASES = (
    (0x00c6, "A"),  # should be "AE"?
    (0x00df, "S"),  # should be "ss"? (note: also wrong case)
    (0x00e6, "a"),  # should be "ae"?
    (0x0131, "i"),
    (0x0132, "I"),  # should be "IJ"?
    (0x0133, "i"),  # should be "ij"?
    (0x0138, "k"),
    (0x0149, "n"),  # "'n"
    (0x014a, "N"),
    (0x014b, "n"),
    (0x0152, "E"),  # should be "OE"?
    (0x0153, "e"),  # should be "oe"?
    (0x0401, u"\u0415"),  # (note: Cyrillic YO -> Cyrillic YE)
    (0x0451, u"\u0435"),  # (note: Cyrillic yo -> Cyrillic ye)
)


def print_record(codepoint, letter):
    """Write one ``<character>\\t<letter>\\n`` record to stdout as UTF-8.

    codepoint -- integer code point of the character to be replaced.
    letter    -- replacement text written after the tab.
    """
    record = _chr(codepoint) + u"\t" + letter + u"\n"
    # Python 3's text-mode stdout only accepts str; its underlying binary
    # stream is exposed as ``.buffer``.  Python 2's stdout takes bytes
    # directly and has no such attribute.
    stream = getattr(sys.stdout, "buffer", None)
    if stream is None:
        stream = sys.stdout
    stream.write(record.encode("utf-8"))


def main():
    """Convert the listing on stdin to records, then add the special cases."""
    # Iterate the stream lazily rather than loading every line up front.
    for line in sys.stdin:
        found = _ACCENTED_LATIN.match(line)
        if not found:
            continue
        hex_code, case, letter = found.groups()
        if case == "SMALL":
            letter = letter.lower()
        print_record(int(hex_code, 16), letter)

    # some special cases
    for codepoint, letter in _SPECIAL_CASES:
        print_record(codepoint, letter)


if __name__ == "__main__":
    main()