顺序 | 访问地址 | 访问类型 | 发送的数据 | 需要获得/提取的返回的值 |
1 | http://www.baidu.com/ | GET | 无 | 返回的cookie中的BAIDUID |
2 | https://passport.baidu.com/v2/api/?getapi&class=login&tpl=mn&tangram=true | GET | 包含BAIDUID这个cookie | 从返回的html中提取出token的值 |
3 | https://passport.baidu.com/v2/api/?login | POST | 一堆的post data,其中token的值是之前提取出来的 | 需要验证返回的cookie中,是否包含BDUSS,PTOKEN,STOKEN,SAVEUSERID |
【版本1:Python实现模拟登陆百度首页的完整代码 之 精简版】
#!/usr/bin/python
# -*- coding: utf-8 -*-
"""
Function:
Demonstrate how to use Python code to emulate logging in to the Baidu main
page: http://www.baidu.com/

Note:
Before trying to understand the following code, please first read the related
articles:
(1) [Summary] The logic/flow and caveats of fetching web pages, analysing
    their content and emulating a website login
(2) [Tutorial] Step by step: using a tool (IE9's F12) to analyse the internal
    logic of logging in to a website (Baidu main page)
(3) [Tutorial] Emulating a website login, Python version

Version:    2012-11-06
Author:     Crifan
"""

import optparse
import re

try:
    # Python 2 module names (what this script was originally written for).
    import cookielib
    import urllib2
    from urllib import urlencode
except ImportError:
    # Python 3 homes of the same APIs, bound to the Python 2 names so the
    # rest of the script is identical on both interpreters.
    import http.cookiejar as cookielib
    import urllib.request as urllib2
    from urllib.parse import urlencode


#------------------------------------------------------------------------------
def checkAllCookiesExist(cookieNameList, cookieJar):
    """Tell whether every wanted cookie name is present in a cookie jar.

    cookieNameList: iterable of cookie names (strings) that must all exist.
    cookieJar:      iterable of objects exposing a ``name`` attribute, e.g. a
                    cookielib.CookieJar.

    Returns True when each name in cookieNameList matches the ``name`` of at
    least one cookie in cookieJar (so an empty cookieNameList gives True),
    otherwise False.
    """
    foundNames = set(cookie.name for cookie in cookieJar)
    return all(eachName in foundNames for eachName in cookieNameList)


#------------------------------------------------------------------------------
def printDelimiter():
    """Print an 80-character line of dashes to separate the output sections."""
    print('-' * 80)


#------------------------------------------------------------------------------
def _toText(data):
    """Return data as a native str so a str regex can search it.

    On Python 3 a response body is bytes and is decoded as UTF-8 (undecodable
    bytes are replaced); a value that is already a str is returned untouched.
    """
    if isinstance(data, str):
        return data
    return data.decode("utf-8", "replace")


#------------------------------------------------------------------------------
def _readAndClose(resp):
    """Read the whole body of resp and close it even if reading fails."""
    try:
        return resp.read()
    finally:
        resp.close()


#------------------------------------------------------------------------------
def emulateLoginBaidu():
    """Emulate logging in to the Baidu main page and print the outcome.

    Reads the username/password from the command line (-u / -p), then:
      step 1: GET the main page so the cookie jar receives BAIDUID;
      step 2: GET the passport "getapi" page and extract the login token;
      step 3: POST the login form and check that the cookies BDUSS, PTOKEN,
              STOKEN and SAVEUSERID were all set.
    Progress and the final result are printed to stdout; nothing is returned.
    """
    print("Function: Used to demostrate how to use Python code to emulate login baidu main page: http://www.baidu.com/")
    print("Usage: emulate_login_baidu_python.py -u yourBaiduUsername -p yourBaiduPassword")
    printDelimiter()

    # parse input parameters
    parser = optparse.OptionParser()
    parser.add_option("-u", "--username", action="store", type="string",
                      default='', dest="username", help="Your Baidu Username")
    parser.add_option("-p", "--password", action="store", type="string",
                      default='', dest="password", help="Your Baidu password")
    options, _ = parser.parse_args()
    # read the two options explicitly instead of exec()-ing every attribute
    # of the options object into the local namespace
    username = options.username
    password = options.password

    printDelimiter()
    print("[preparation] using cookieJar & HTTPCookieProcessor to automatically handle cookies")
    cj = cookielib.CookieJar()
    opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cj))
    # installed globally so that every later urlopen() shares the cookie jar
    urllib2.install_opener(opener)

    printDelimiter()
    print("[step1] to get cookie BAIDUID")
    baiduMainUrl = "http://www.baidu.com/"
    # only the Set-Cookie headers matter here, the body is not needed
    urllib2.urlopen(baiduMainUrl).close()
    for index, cookie in enumerate(cj):
        print("[ %s ] %s" % (index, cookie))

    printDelimiter()
    print("[step2] to get token value")
    getapiUrl = "https://passport.baidu.com/v2/api/?getapi&class=login&tpl=mn&tangram=true"
    getapiRespHtml = _toText(_readAndClose(urllib2.urlopen(getapiUrl)))
    # the page contains a line such as:
    # bdPass.api.params.login_token='5ab690978812b0e7fbbe1bfc267b90b3';
    foundTokenVal = re.search(
        r"bdPass\.api\.params\.login_token='(?P<tokenVal>\w+)';",
        getapiRespHtml)
    if not foundTokenVal:
        print("Fail to extract token value from html= %s" % getapiRespHtml)
        return

    tokenVal = foundTokenVal.group("tokenVal")
    print("tokenVal= %s" % tokenVal)

    printDelimiter()
    print("[step3] emulate login baidu")
    staticpage = "http://www.baidu.com/cache/user/html/jump.html"
    baiduMainLoginUrl = "https://passport.baidu.com/v2/api/?login"
    postDict = {
        'charset'    : "utf-8",
        'token'      : tokenVal,
        'isPhone'    : "false",
        'index'      : "0",
        'staticpage' : staticpage,
        'loginType'  : "1",
        'tpl'        : "mn",
        'callback'   : "parent.bdPass.api.login._postCallback",
        'username'   : username,
        'password'   : password,
        'mem_pass'   : "on",
    }
    # urlencode() percent-encodes the values, e.g. the staticpage URL becomes
    # http%3A%2F%2Fwww.baidu.com%2Fcache%2Fuser%2Fhtml%2Fjump.html
    # The result is pure ASCII; Python 3 requires the request body as bytes.
    postData = urlencode(postDict).encode("ascii")
    req = urllib2.Request(baiduMainLoginUrl, postData)
    # in most cases the content type of a POST request is
    # application/x-www-form-urlencoded
    req.add_header('Content-Type', "application/x-www-form-urlencoded")
    # again only the cookies set by the response are of interest
    urllib2.urlopen(req).close()

    cookiesToCheck = ['BDUSS', 'PTOKEN', 'STOKEN', 'SAVEUSERID']
    loginBaiduOK = checkAllCookiesExist(cookiesToCheck, cj)
    if loginBaiduOK:
        print("+++ Emulate login baidu is OK, ^_^")
    else:
        print("--- Failed to emulate login baidu !")


if __name__ == "__main__":
    emulateLoginBaidu()
【版本2:Python实现模拟登陆百度首页的完整代码 之 crifanLib.py版】
这个是另外一个版本,其中利用到我自己的python库:crifanLib.py :
#!/usr/bin/python
# -*- coding: utf-8 -*-
"""
Function:
Demonstrate how to use Python code to emulate logging in to the Baidu main
page: http://www.baidu.com/
Uses the functions from crifanLib.py

Note:
Before trying to understand the following code, please first read the related
articles:
(1) [Summary] The logic/flow and caveats of fetching web pages, analysing
    their content and emulating a website login
(2) [Tutorial] Step by step: using a tool (IE9's F12) to analyse the internal
    logic of logging in to a website (Baidu main page)
(3) [Tutorial] Emulating a website login, Python version

Version:    2012-11-07
Author:     Crifan
Contact:    admin (at) crifan.com
"""

import optparse
import re
import zlib

try:
    # Python 2 module names (what this script was originally written for).
    import cookielib
    import urllib2
    from urllib import urlencode
except ImportError:
    # Python 3 homes of the same APIs, bound to the Python 2 names so the
    # rest of the script is identical on both interpreters.
    import http.cookiejar as cookielib
    import urllib.request as urllib2
    from urllib.parse import urlencode

#===============================================================================
# following are some functions, extracted from my python library: crifanLib.py
# for the whole crifanLib.py:
# download : http://code.google.com/p/crifanlib/downloads/list
#===============================================================================

gConst = {
    'constUserAgent' : 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; InfoPath.3; .NET4.0C; .NET4.0E)',
    # alternative: "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:15.0) Gecko/20100101 Firefox/15.0.1"
}

################################################################################
# Network: urllib/urllib2/http
################################################################################

#------------------------------------------------------------------------------
def getUrlResponse(url, postDict=None, headerDict=None, timeout=0, useGzip=False):
    """Open url and return the response object.

    If an opener with a cookie jar has already been installed, it is used
    automatically because the request goes through urllib2.urlopen().

    url:        address to open; converted with str() first.
    postDict:   optional dict of form fields; when non-empty the request is a
                POST with a urlencoded body, otherwise a GET.
    headerDict: optional dict of extra request headers; they are applied
                after the default headers and therefore override them.
    timeout:    timeout passed to urlopen() when greater than 0; otherwise
                urlopen() is called without a timeout argument.
    useGzip:    when true, adds "Accept-Encoding: gzip, deflate".

    Returns the object returned by urllib2.urlopen(); the caller is
    responsible for reading and closing it.
    """
    # make sure url is a plain string, not unicode (a Python 2 concern)
    url = str(url)

    if postDict:
        # urlencode() output is pure ASCII; Python 3 needs the body as bytes
        postData = urlencode(postDict).encode("ascii")
        req = urllib2.Request(url, postData)
        req.add_header('Content-Type', "application/x-www-form-urlencoded")
    else:
        req = urllib2.Request(url)

    defHeaderDict = {
        'User-Agent'    : gConst['constUserAgent'],
        'Cache-Control' : 'no-cache',
        'Accept'        : '*/*',
        'Connection'    : 'Keep-Alive',
    }

    # add default headers first
    for eachDefHd, eachDefVal in defHeaderDict.items():
        req.add_header(eachDefHd, eachDefVal)

    if useGzip:
        req.add_header('Accept-Encoding', 'gzip, deflate')

    # add customized headers last -> allows overwriting the default headers
    if headerDict:
        for key, value in headerDict.items():
            req.add_header(key, value)

    if timeout and timeout > 0:
        # set timeout value only if necessary
        return urllib2.urlopen(req, timeout=timeout)
    return urllib2.urlopen(req)


#------------------------------------------------------------------------------
def getUrlRespHtml(url, postDict=None, headerDict=None, timeout=0, useGzip=True):
    """Open url and return the response body (html).

    Takes the same arguments as getUrlResponse(); useGzip defaults to True.

    Returns the body exactly as read from the response, except that it is
    gunzipped when useGzip is true and the response carries
    "Content-Encoding: gzip". No other content encoding is decoded here.
    """
    resp = getUrlResponse(url, postDict, headerDict, timeout, useGzip)
    try:
        respHtml = resp.read()
        respInfo = resp.info()
    finally:
        resp.close()

    if useGzip:
        # Sometimes the request asks for gzip,deflate but the server returns
        # un-gzipped html; the response headers then lack
        # "Content-Encoding: gzip", so only decompress when the header says
        # the data really is gzipped.
        if respInfo.get('Content-Encoding') == "gzip":
            # 16 + MAX_WBITS tells zlib to expect a gzip header/trailer
            respHtml = zlib.decompress(respHtml, 16 + zlib.MAX_WBITS)

    return respHtml


################################################################################
# Cookies
################################################################################

#------------------------------------------------------------------------------
def checkAllCookiesExist(cookieNameList, cookieJar):
    """Tell whether every wanted cookie name is present in a cookie jar.

    cookieNameList: iterable of cookie names (strings) that must all exist.
    cookieJar:      iterable of objects exposing a ``name`` attribute, e.g. a
                    cookielib.CookieJar.

    Returns True when each name in cookieNameList matches the ``name`` of at
    least one cookie in cookieJar (so an empty cookieNameList gives True),
    otherwise False.
    """
    foundNames = set(cookie.name for cookie in cookieJar)
    return all(eachName in foundNames for eachName in cookieNameList)

#===============================================================================


#------------------------------------------------------------------------------
def printDelimiter():
    """Print an 80-character line of dashes to separate the output sections."""
    print('-' * 80)


#------------------------------------------------------------------------------
def _toText(data):
    """Return data as a native str so a str regex can search it.

    On Python 3 a response body is bytes and is decoded as UTF-8 (undecodable
    bytes are replaced); a value that is already a str is returned untouched.
    """
    if isinstance(data, str):
        return data
    return data.decode("utf-8", "replace")


#------------------------------------------------------------------------------
def emulateLoginBaidu():
    """Emulate logging in to the Baidu main page and print the outcome.

    Reads the username/password from the command line (-u / -p), then:
      step 1: GET the main page so the cookie jar receives BAIDUID;
      step 2: GET the passport "getapi" page and extract the login token;
      step 3: POST the login form and check that the cookies BDUSS, PTOKEN,
              STOKEN and SAVEUSERID were all set.
    Progress and the final result are printed to stdout; nothing is returned.
    """
    print("Function: Used to demostrate how to use Python code to emulate login baidu main page: http://www.baidu.com/")
    print("Usage: emulate_login_baidu_python.py -u yourBaiduUsername -p yourBaiduPassword")
    printDelimiter()

    # parse input parameters
    parser = optparse.OptionParser()
    parser.add_option("-u", "--username", action="store", type="string",
                      default='', dest="username", help="Your Baidu Username")
    parser.add_option("-p", "--password", action="store", type="string",
                      default='', dest="password", help="Your Baidu password")
    options, _ = parser.parse_args()
    # read the two options explicitly instead of exec()-ing every attribute
    # of the options object into the local namespace
    username = options.username
    password = options.password

    printDelimiter()
    print("[preparation] using cookieJar & HTTPCookieProcessor to automatically handle cookies")
    cj = cookielib.CookieJar()
    opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cj))
    # installed globally so that getUrlResponse()/getUrlRespHtml() share the jar
    urllib2.install_opener(opener)

    printDelimiter()
    print("[step1] to get cookie BAIDUID")
    baiduMainUrl = "http://www.baidu.com/"
    # only the Set-Cookie headers matter here, the body is not needed
    getUrlResponse(baiduMainUrl).close()
    # here you should see: BAIDUID
    for index, cookie in enumerate(cj):
        print("[ %s ] %s" % (index, cookie))

    printDelimiter()
    print("[step2] to get token value")
    getapiUrl = "https://passport.baidu.com/v2/api/?getapi&class=login&tpl=mn&tangram=true"
    getapiRespHtml = _toText(getUrlRespHtml(getapiUrl))
    # the page contains a line such as:
    # bdPass.api.params.login_token='5ab690978812b0e7fbbe1bfc267b90b3';
    foundTokenVal = re.search(
        r"bdPass\.api\.params\.login_token='(?P<tokenVal>\w+)';",
        getapiRespHtml)
    if not foundTokenVal:
        print("Fail to extract token value from html= %s" % getapiRespHtml)
        return

    tokenVal = foundTokenVal.group("tokenVal")
    print("tokenVal= %s" % tokenVal)

    printDelimiter()
    print("[step3] emulate login baidu")
    staticpage = "http://www.baidu.com/cache/user/html/jump.html"
    baiduMainLoginUrl = "https://passport.baidu.com/v2/api/?login"
    postDict = {
        'charset'    : "utf-8",
        'token'      : tokenVal,
        'isPhone'    : "false",
        'index'      : "0",
        'staticpage' : staticpage,
        'loginType'  : "1",
        'tpl'        : "mn",
        'callback'   : "parent.bdPass.api.login._postCallback",
        'username'   : username,
        'password'   : password,
        'mem_pass'   : "on",
    }
    # the body of the login response is not inspected; success is judged
    # purely from the cookies the response has set
    getUrlRespHtml(baiduMainLoginUrl, postDict)

    cookiesToCheck = ['BDUSS', 'PTOKEN', 'STOKEN', 'SAVEUSERID']
    loginBaiduOK = checkAllCookiesExist(cookiesToCheck, cj)
    if loginBaiduOK:
        print("+++ Emulate login baidu is OK, ^_^")
    else:
        print("--- Failed to emulate login baidu !")


if __name__ == "__main__":
    emulateLoginBaidu()
D:\tmp\tmp_dev_root\python\emulate_login_baidu_python>emulate_login_baidu_python.py -u crifan -p xxxxxx
Function: Used to demostrate how to use Python code to emulate login baidu main page: http://www.baidu.com/
Usage: emulate_login_baidu_python.py -u yourBaiduUsername -p yourBaiduPassword
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
[preparation] using cookieJar & HTTPCookieProcessor to automatically handle cookies
--------------------------------------------------------------------------------
[step1] to get cookie BAIDUID
[ 0 ] <Cookie BAIDUID=8D85C6528FDF7B5F49C746A18524495B:FG=1 for .baidu.com/>
--------------------------------------------------------------------------------
[step2] to get token value
tokenVal= 4d3f004bbe3e6f0cfa435abd38dd9fec
--------------------------------------------------------------------------------
[step3] emulate login baidu
+++ Emulate login baidu is OK, ^_^