2.2.8. 将字符实体替换为Unicode数字实体:replaceStrEntToNumEnt
#------------------------------------------------------------------------------
# Convert named (string) HTML entities to Unicode decimal numeric entities,
# e.g. "&nbsp;" -> "&#160;", "&quot;" -> "&#34;", "&euro;" -> "&#8364;".
# refer: http://www.htmlhelp.com/reference/html40/entities/latin1.html
#        http://www.htmlhelp.com/reference/html40/entities/special.html
def replaceStrEntToNumEnt(text):
    """Replace every known named HTML entity in ``text`` with its numeric form.

    The name -> code point mapping comes from the standard library
    (``html.entities`` on Python 3, ``htmlentitydefs`` on Python 2) instead of
    a hand-maintained table. That table is a superset of the Latin-1 and
    Special entity sets this function was written for, so those entities are
    still converted, to the same ``&#NNN;`` decimal form.

    Args:
        text: the string that may contain named entities such as ``&amp;``.

    Returns:
        A copy of ``text`` in which each recognised ``&name;`` is replaced by
        ``&#<decimal code point>;``. Unrecognised names, existing numeric
        entities and bare ``&`` characters are returned unchanged.
    """
    # Imported locally so the function does not depend on what the enclosing
    # module happens to import.
    import re
    try:
        from html.entities import name2codepoint  # Python 3
    except ImportError:
        from htmlentitydefs import name2codepoint  # Python 2

    def toNumericEntity(match):
        # Entity names are case-sensitive ("&Yuml;" and "&yuml;" differ), so
        # the captured name is looked up exactly as written.
        codePoint = name2codepoint.get(match.group(1))
        if codePoint is None:
            # Not a known entity name: leave the original text untouched.
            return match.group(0)
        return "&#%d;" % codePoint

    # One pass over the text. The trailing ";" is required, and the numeric
    # output never matches this pattern, so a replacement is never re-processed.
    return re.sub(r"&([A-Za-z][A-Za-z0-9]*);", toNumericEntity, text)
例 2.12. replaceStrEntToNumEnt的使用范例
# Replace the named entities in `line` with their numeric-entity equivalents.
line = replaceStrEntToNumEnt(line)