#------------------------------------------------------------------------------
# get the python script's own file name, without directory and ".py" suffix
# extract out xxx from any of:
#   D:\yyy\zzz\xxx.py
#   /yyy/zzz/xxx.py
#   xxx.py
def extractFilename(inputStr):
    """Return the bare script name contained in a script path.

    Args:
        inputStr: path of the script, typically sys.argv[0]; may use
            backslashes, forward slashes, or no directory part at all.

    Returns:
        The last path component with a trailing ".py" removed. Other
        suffixes are left untouched.
    """
    # Normalise the separator first so that both Windows-style and POSIX-style
    # paths are reduced to their last component.
    scriptName = inputStr.replace("\\", "/").rsplit("/", 1)[-1]
    if scriptName.endswith(".py"):
        scriptName = scriptName[:-3]  # remove ".py"
    return scriptName
例 2.5. extractFilename的使用范例
if __name__=="__main__":
    # How sys.argv[0] looks depends on how the script was started:
    #   python xxx.py -s yyy  -> sys.argv[0]=xxx.py
    #   xxx.py -s yyy         -> sys.argv[0]=D:\yyy\zzz\xxx.py
    # extractFilename is applied to sys.argv[0] to get the script's own name.
    scriptSelfName = extractFilename(sys.argv[0]);
#------------------------------------------------------------------------------
# replace the &#N; (N is a decimal number) with the corresponding unicode char
# eg: replace "&#39;" with "'" in "Creepin&#39; up on you"
def repUniNumEntToChar(text):
    """Replace decimal numeric character references with their characters.

    Args:
        text: string that may contain references of the form "&#N;".

    Returns:
        The text with every convertible reference replaced. A reference whose
        number is not a valid code point is left exactly as it was instead of
        aborting the whole conversion.
    """
    try:
        toUniChr = unichr  # Python 2
    except NameError:
        toUniChr = chr  # Python 3: chr already returns a unicode char

    def transToUniChr(match):
        # group(1) is the digit run between "&#" and ";"
        try:
            return toUniChr(int(match.group(1)))
        except (ValueError, OverflowError):
            # number out of the supported code point range -> keep the entity
            return match.group(0)

    return re.sub(r'&#([0-9]+);', transToUniChr, text)
#------------------------------------------------------------------------------
# generate the full url, which include the main url plus the parameter list
# Note:
# normally just use urllib.urlencode is OK.
# only use this if you do NOT want urllib.urlencode convert some special
# chars($,:,{,},...) into %XX
def genFullUrl(mainUrl, paraDict):
    """Append the entries of paraDict to mainUrl as an unescaped query string.

    Args:
        mainUrl: url to extend.
        paraDict: mapping of parameter name to value; both are passed through
            str() and written as-is, without any percent-encoding.

    Returns:
        mainUrl followed by "name=value" pairs joined with "&". If mainUrl has
        no "?" yet, a "?" is inserted first (also when paraDict is empty, as
        before). If mainUrl already contains a "?", the pairs are appended
        with "&" so that no second "?" is produced.
    """
    query = '&'.join(str(para) + '=' + str(value) for para, value in paraDict.items())
    if '?' not in mainUrl:
        return mainUrl + '?' + query
    # mainUrl already carries a query part: continue it rather than restart it
    if not query or mainUrl[-1] in '?&':
        return mainUrl + query
    return mainUrl + '&' + query
例 2.7. genFullUrl的使用范例
# Note: here not use urllib.urlencode to encode para, # for the encoded result will convert some special chars($,:,{,},...) into %XX paraDict = { 'asyn' : '1', 'thread_id_enc' : '', 'start' : '', 'count' : '', 'orderby_type' : '0', }; paraDict['thread_id_enc'] = str(threadIdEnc); paraDict['start'] = str(startCmtIdx); paraDict['count'] = str(reqCmtNum); paraDict['t'] = str(cmtReqTime); mainUrl = "http://hi.baidu.com/cmt/spcmt/get_thread"; getCmtUrl = genFullUrl(mainUrl, paraDict);
#------------------------------------------------------------------------------
# check whether two urls are similar
# note: both inputs are converted with str() before comparing
def urlIsSimilar(url1, url2):
    """Return True when the two urls differ at most in their file name.

    Two urls count as similar when all of the following hold:
      * they split into the same number of '/'-separated parts,
      * the text after the last '.' is the same in both,
      * everything before the last '/' is the same in both.
    """
    first = str(url1)
    second = str(url2)

    # different number of path parts -> different structure -> not similar
    if len(first.split('/')) != len(second.split('/')):
        return False

    # at least the suffix has to match
    if first[first.rfind('.') + 1:] != second[second.rfind('.') + 1:]:
        return False

    # apart from the name itself, the rest has to match
    return first[:first.rfind('/')] == second[:second.rfind('/')]
如果相似,返回True和相似的地址;
如果不相似,返回False和空字符串''。
#------------------------------------------------------------------------------
# look for an entry of urlList that is similar to url
# if found, return (True, similarSrcUrl)
# if not found, return (False, '')
def findSimilarUrl(url, urlList):
    """Return (True, match) for the first entry of urlList that
    urlIsSimilar() accepts for url, or (False, '') when there is none."""
    for candidateUrl in urlList:
        if urlIsSimilar(url, candidateUrl):
            return (True, candidateUrl)
    return (False, '')
例 2.9. findSimilarUrl的使用范例
# to check is similar, only when need check and the list it not empty if ((gCfg['omitSimErrUrl'] == 'yes') and gVal['errorUrlList']): (isSimilar, simSrcUrl) = findSimilarUrl(curUrl, gVal['errorUrlList']); if isSimilar : logging.warning(" Omit process %s for similar with previous error url", curUrl); logging.warning(" %s", simSrcUrl); continue;
#------------------------------------------------------------------------------
# remove non-word chars == only retain alphanumeric characters (letter+digit)
# and underscore
# eg:
# from againinput4@yeah to againinput4yeah
# from green-waste to greenwaste
def removeNonWordChar(inputString):
    """Return inputString with every character that is not matched by the
    regex class \\w removed."""
    nonWordP = re.compile(r"[^\w]")  # anything that is not a word char
    return nonWordP.sub("", inputString)
例 2.10. removeNonWordChar的使用范例
# strip every non-word char from the blog user name ...
wxrValidUsername = removeNonWordChar(gVal['blogUser']);
# ... and additionally drop the underscores that removeNonWordChar keeps
wxrValidUsername = wxrValidUsername.replace("_", "");
logging.info("Generated WXR safe username is %s", wxrValidUsername);
使得处理后的字符串,在XML中都是合法的了。
#------------------------------------------------------------------------------
# remove control characters from the input string
# otherwise the wordpress importer fails to import the wxr
# eg:
# 1. http://againinput4.blog.163.com/blog/static/172799491201110111145259/
#    content contains some invalid ascii control chars
# 2. http://hi.baidu.com/notebookrelated/blog/item/8bd88e351d449789a71e12c2.html
#    165th comment contains invalid control char: ETX
# 3. http://green-waste.blog.163.com/blog/static/32677678200879111913911/
#    title contains control char:DC1, BS, DLE, DLE, DLE, DC1
def removeCtlChr(inputString):
    """Return inputString without ASCII control characters.

    Characters with code below 32 are dropped, except tab (9), line feed (10)
    and carriage return (13). DEL (0x7F) is dropped as well. Every other
    character is kept unchanged and in order.
    """
    # control codes that are allowed to stay: \t, \n, \r
    keptCtlCodes = (9, 10, 13)

    def isValidChr(c):
        code = ord(c)
        if code == 0x7F:
            return False
        return code >= 32 or code in keptCtlCodes

    # join once instead of growing the result string char by char
    return ''.join(c for c in inputString if isValidChr(c))
例 2.11. removeCtlChr的使用范例
# remove the control char in title: # eg; # http://green-waste.blog.163.com/blog/static/32677678200879111913911/ # title contains control char:DC1, BS, DLE, DLE, DLE, DC1 infoDict['title'] = removeCtlChr(infoDict['title']);
关于控制字符 | |
---|---|
如果不了解什么是控制字符,请参考:ASCII字符集中的功能/控制字符 |
#------------------------------------------------------------------------------
# convert the string (named) entity to unicode number entity
# eg: "&nbsp;" -> "&#160;", "&amp;" -> "&#38;", "&euro;" -> "&#8364;"
# refer: http://www.htmlhelp.com/reference/html40/entities/latin1.html
#        http://www.htmlhelp.com/reference/html40/entities/special.html
def replaceStrEntToNumEnt(text):
    """Replace named HTML entities in text with decimal numeric entities.

    The name-to-number mapping comes from the standard library's HTML entity
    table instead of a hand-written dict, so every HTML 4 entity name it
    contains is converted.

    Args:
        text: string that may contain entities of the form "&name;".

    Returns:
        The text with each known "&name;" replaced by "&#N;". Unknown names
        and entities that are already numeric are left unchanged.
    """
    try:
        from htmlentitydefs import name2codepoint  # Python 2
    except ImportError:
        from html.entities import name2codepoint  # Python 3

    def toNumEnt(match):
        codePoint = name2codepoint.get(match.group(1))
        if codePoint is None:
            # not a known entity name -> keep the original text
            return match.group(0)
        return "&#%d;" % codePoint

    # one pass over the text; entity names are case sensitive
    return re.sub(r"&([A-Za-z][A-Za-z0-9]*);", toNumEnt, text)
#------------------------------------------------------------------------------
# convert the xxx=yyy into tuple('xxx', yyy), then return the tuple value
# [makesure input string]
# (1) is not include whitespace
# (2) include '='
# (3) last is no ';'
# [possible input string]
# blogUserName="againinput4"
# publisherEmail=""
# synchMiniBlog=false
# publishTime=1322129849397
# publisherName=null
# publisherNickname="\u957F\u5927\u662F\u70E6\u607C"
def convertToTupleVal(equationStr):
    """Parse a "key=value" string into a (key, value) tuple.

    The value part is converted as follows:
      * text wrapped in double quotes -> str of the text between the quotes
      * null / false / true (any letter case) -> None / False / True
      * anything else -> int

    Returns:
        (key, value) on success. When the input has no '=' or the value can
        not be converted, a failure message is printed and ('', None) is
        returned.
    """
    literalValues = {'null': None, 'false': False, 'true': True}
    try:
        # Note:
        # do not split on '=', the value itself may contain '=', like:
        # http://img.bimg.126.net/photo/hmZoNQaqzZALvVp0rE7faA==/0.jpg
        # so only the first '=' separates key and value
        firstEqualPos = equationStr.find("=")
        if firstEqualPos < 0:
            # without this check find()'s -1 would be used as a slice index
            raise ValueError("no '=' in input")
        key = equationStr[:firstEqualPos]
        valuePart = equationStr[(firstEqualPos + 1):]

        if len(valuePart) >= 2 and valuePart[0] == '"' and valuePart[-1] == '"':
            value = str(valuePart[1:-1])  # string type
        elif valuePart.lower() in literalValues:
            value = literalValues[valuePart.lower()]
        else:
            value = int(valuePart)  # must be an int value
    except (ValueError, TypeError, AttributeError):
        print("Fail of convert the equal string %s to value" % (equationStr))
        return ('', None)
    return (key, value)
例 2.13. convertToTupleVal的使用范例
# (4) convert to value for equation in equationList : (key, value) = convertToTupleVal(equation);
#------------------------------------------------------------------------------
# drop the empty entries of a list
def removeEmptyInList(list):
    """Return a new list holding only the truthy items of the given list,
    in their original order. The input list is not modified."""
    return [item for item in list if item]
例 2.14. removeEmptyInList的使用范例
# Note: some list contain [u''], so is not meaningful, remove it here # for only [] is empty, [u''] is not empty -> error while exporting to WXR infoDict['tags'] = removeEmptyInList(infoDict['tags']);
#------------------------------------------------------------------------------
# remove the repeated items of a list
def uniqueList(old_list):
    """Return a new list with the items of old_list in first-seen order,
    each value appearing only once. Items are compared with ==, so they do
    not have to be hashable."""
    seenItems = []
    for item in old_list:
        if item in seenItems:
            continue
        seenItems.append(item)
    return seenItems
#------------------------------------------------------------------------------
# split listToFilter by membership in listToCompare:
# the items NOT in listToCompare, and the items that already exist in it
def filterList(listToFilter, listToCompare):
    """Return (filteredList, existedList).

    filteredList holds the items of listToFilter that are not in
    listToCompare; existedList holds the ones that are. Both keep the
    order of listToFilter.
    """
    notYetPresent = []
    alreadyPresent = []
    for item in listToFilter:
        # pick the bucket first, then append once
        bucket = alreadyPresent if item in listToCompare else notYetPresent
        bucket.append(item)
    return (notYetPresent, alreadyPresent)
例 2.16. filterList的使用范例
# remove processed and got ones that has been processed (filteredPicList, existedList) = filterList(nonOverlapList, gVal['processedUrlList']);
#------------------------------------------------------------------------------
# generate a random string made of decimal digits
# max digit number is 12
def randDigitsStr(digitNum = 12):
    """Return a random string of exactly digitNum decimal digits.

    Args:
        digitNum: wanted length; values above 12 are reduced to 12 and
            negative values are treated as 0.

    Returns:
        A string containing only the characters 0-9 (leading zeros are
        possible), or '' when digitNum is 0.
    """
    maxDigitNum = 12
    digitNum = max(0, min(digitNum, maxDigitNum))
    # Draw an integer and zero-pad it, rather than cutting digits out of
    # str(random.random()): that text may be in scientific notation or have
    # fewer digits than requested.
    allDigits = "%012d" % random.randrange(10 ** maxDigitNum)
    return allDigits[:digitNum]
#------------------------------------------------------------------------------
# convert a list of (key, value) tuples to a dict
# from: [(u'type', u'text/javascript'), (u'src', u'http://partner.googleadservices.com/gampad/google_service.js')]
# to:   { u'type':u'text/javascript', u'src':u'http://partner.googleadservices.com/gampad/google_service.js' }
def tupleListToDict(tupleList):
    """Return a dict built from the two-item entries of tupleList.

    When a key occurs more than once, the value of its last occurrence wins.
    """
    return dict((name, content) for (name, content) in tupleList)
例 2.18. tupleListToDict 的使用范例
#singleContent: name=script, attrMap=None, attrs=[(u'type', u'text/javascript'), (u'src', u'http://partner.googleadservices.com/gampad/google_service.js')] attrsDict = tupleListToDict(singleContent.attrs);