#!/usr/bin/python
"""Generate "character<TAB>base letter" records from UnicodeData.txt.

Reads lines in UnicodeData.txt format from standard input and writes one
UTF-8 encoded record per line to standard output.  A record is emitted for
every code point whose general category starts with 'L' and whose
decomposition has more than one element and starts with an ASCII Latin
letter.  A fixed list of special cases is appended after the generated
records.

The script runs unchanged on Python 2 and Python 3.
"""

import re
import sys

# Python 3 has no unichr(); its chr() already covers the full Unicode range.
try:
    _unichr = unichr
except NameError:
    _unichr = chr

# Strips the optional "<type>" tag in front of a decomposition mapping.
# http://www.unicode.org/reports/tr44/tr44-14.html#Character_Decomposition_Mappings
_DECOMPOSITION_TYPE_PATTERN = re.compile(" *<[^>]*> *")

# Hand-maintained records that are always emitted after the generated ones.
_SPECIAL_CASES = (
    (0x00c6, "A"),  # LATIN CAPITAL LETTER AE
    (0x00df, "S"),  # LATIN SMALL LETTER SHARP S
    (0x00e6, "a"),  # LATIN SMALL LETTER AE
    (0x0131, "i"),  # LATIN SMALL LETTER DOTLESS I
    (0x0132, "I"),  # LATIN CAPITAL LIGATURE IJ
    (0x0133, "i"),  # LATIN SMALL LIGATURE IJ
    (0x0138, "k"),  # LATIN SMALL LETTER KRA
    (0x0149, "n"),  # LATIN SMALL LETTER N PRECEDED BY APOSTROPHE
    (0x014a, "N"),  # LATIN CAPITAL LETTER ENG
    (0x014b, "n"),  # LATIN SMALL LETTER ENG
    (0x0152, "E"),  # LATIN CAPITAL LIGATURE OE
    (0x0153, "e"),  # LATIN SMALL LIGATURE OE
    (0x0401, u"\u0415"),  # CYRILLIC CAPITAL LETTER IO
    (0x0451, u"\u0435"),  # CYRILLIC SMALL LETTER IO
)


def print_record(codepoint, letter):
    """Write one "<character>\\t<letter>" line to stdout, encoded as UTF-8.

    codepoint -- integer code point of the character to be replaced.
    letter    -- replacement text (a one-character string in this script).
    """
    record = _unichr(codepoint) + "\t" + letter + "\n"
    # Write bytes so the output is UTF-8 regardless of the locale.  Python 3
    # exposes the binary stream as sys.stdout.buffer; on Python 2 (or when
    # stdout has been replaced by a plain byte stream) sys.stdout itself
    # accepts byte strings.
    stream = getattr(sys.stdout, "buffer", sys.stdout)
    stream.write(record.encode("utf-8"))


def is_latin_letter(codepoint):
    """Return True if codepoint is an ASCII letter (a-z or A-Z)."""
    return (ord('a') <= codepoint <= ord('z')) or \
           (ord('A') <= codepoint <= ord('Z'))


def _decomposed_latin_letters(lines):
    """Yield (codepoint, base_letter) pairs parsed from UnicodeData.txt lines.

    Lines with fewer than six ';'-separated fields are ignored.
    """
    for line in lines:
        fields = line.split(";")
        if len(fields) <= 5:
            continue
        # http://www.unicode.org/reports/tr44/tr44-14.html#UnicodeData.txt
        general_category = fields[2]
        decomposition = _DECOMPOSITION_TYPE_PATTERN.sub(' ', fields[5])
        # http://www.unicode.org/reports/tr44/tr44-14.html#General_Category_Values
        if not general_category.startswith('L'):
            continue
        codepoint = int(fields[0], 16)
        nfd = [int(s, 16) for s in decomposition.split(" ") if s != ""]
        # Only multi-element decompositions whose first element is a plain
        # Latin letter qualify.
        if len(nfd) > 1 and is_latin_letter(nfd[0]):
            yield codepoint, chr(nfd[0])


def main():
    """Convert UnicodeData.txt on stdin into records on stdout."""
    # Iterate the stream directly instead of readlines() so the whole input
    # is never held in memory at once.
    for codepoint, letter in _decomposed_latin_letters(sys.stdin):
        print_record(codepoint, letter)

    # some special cases
    for codepoint, letter in _SPECIAL_CASES:
        print_record(codepoint, letter)


if __name__ == "__main__":
    main()