#!/usr/bin/python
import re
import sys
def print_record(codepoint, letter):
    """Write one mapping record to standard output as UTF-8.

    The record consists of the character for *codepoint*, a tab, *letter*
    and a trailing newline.

    Args:
        codepoint: Integer Unicode code point of the source character.
        letter: Replacement text to place after the tab.
    """
    try:
        to_char = unichr  # Python 2: chr() only covers 0-255 there.
    except NameError:
        to_char = chr  # Python 3: unichr() no longer exists.
    record = (to_char(codepoint) + u"\t" + letter + u"\n").encode("utf-8")
    # A Python 3 text stream rejects bytes, so use its underlying binary
    # buffer when it has one; Python 2's sys.stdout accepts bytes directly.
    getattr(sys.stdout, "buffer", sys.stdout).write(record)
def main():
    """Read character descriptions from stdin and print mapping records.

    Each input line is expected to start with a hexadecimal code point, a
    semicolon and the character name (this looks like the UnicodeData.txt
    layout -- TODO confirm).  For every line whose name has the form
    ``LATIN (SMALL|CAPITAL) LETTER <X> WITH ...`` a record mapping the code
    point to the single letter ``X`` is printed, lower-cased for SMALL
    letters.  Once the input is exhausted, a fixed list of special-case
    records is printed.
    """
    pattern = re.compile(
        r"^([0-9A-F]+);LATIN (SMALL|CAPITAL) LETTER ([A-Z]) WITH ")

    # Iterate over the stream itself rather than readlines(), so the whole
    # input never has to be held in memory at once.
    for line in sys.stdin:
        match = pattern.match(line)
        if not match:
            continue
        hex_code, case, letter = match.groups()
        if case == "SMALL":
            letter = letter.lower()
        print_record(int(hex_code, 16), letter)

    # Some special cases.  The trailing comments keep the open questions
    # about the chosen replacements.
    special_cases = (
        (0x00c6, "A"),  # should be "AE"?
        (0x00df, "S"),  # should be "ss"? (note: also wrong case)
        (0x00e6, "a"),  # should be "ae"?
        (0x0131, "i"),
        (0x0132, "I"),  # should be "IJ"?
        (0x0133, "i"),  # should be "ij"?
        (0x0138, "k"),
        (0x0149, "n"),  # "'n"
        (0x014a, "N"),
        (0x014b, "n"),
        (0x0152, "E"),  # should be "OE"?
        (0x0153, "e"),  # should be "oe"?
        (0x0401, u"\u0415"),  # (note: Cyrillic YO -> Cyrillic YE)
        (0x0451, u"\u0435"),  # (note: Cyrillic yo -> Cyrillic ye)
    )
    for codepoint, letter in special_cases:
        print_record(codepoint, letter)
# Run main() only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()