org-page

static site generator

如何抓取中国人民银行调查统计司的数据

友人找我解决个问题,用python抓取中国人民银行调查统计司的一个页面(http://www.pbc.gov.cn/diaochatongjisi/116219/116319/index.html)的数据。原本以为这会是一个极其简单的问题。然而却碰到了一个磕绊。

首先用惯用手法, curl 看下裸的内容。

$ curl -i  http://www.pbc.gov.cn/diaochatongjisi/116219/116319/index.html

HTTP/1.1 200 OK
Server: 360wzws
Date: Wed, 19 Oct 2016 15:04:28 GMT
Content-Type: text/html
Transfer-Encoding: chunked
Connection: keep-alive
Set-Cookie: wzwsconfirm=c638a799682c4ef7c292c212e0fc23db; path=/
Set-Cookie: wzwsvtime=1476889468; path=/

<html>
<head>
</head>
<body>
<noscript>
<h1><strong>请开启JavaScript并刷新该页.</strong></h1>
</noscript>
<script type="text/javascript">
eval(function(p,a,c,k,e,d){e=function(c){return(c<a?'':e(parseInt(c/a)))+((c=c%a)>32?String.fromCharCode(c+32):c.toString(33))};if(!''.replace(/^/,String)){while(c--)d[e(c)]=k[c]||e(c);k=[function(e){return d[e]}];e=function(){return'\\w+'};c=1};while(c--)if(k[c])p=p.replace(new RegExp('\\b'+e(c)+'\\b','g'),k[c]);return p}('15 D="k";15 1a="i";15 1b="l";15 11=d;15 F = "e+/=";J g(10) {15 U, N, R;15 o, p, q;R = 10.S;N = 0;U = "";17 (N < R) {o = 10.s(N++) & 6;O (N == R) {U += F.r(o >> a);U += F.r((o & 1) << b);U += "==";n;}p = 10.s(N++);O (N == R) {U += F.r(o >> a);U += F.r(((o & 1) << b) | ((p & 5) >> b));U += F.r((p & 4) << a);U += "=";n;}q = 10.s(N++);U += F.r(o >> a);U += F.r(((o & 1) << b) | ((p & 5) >> b));U += F.r(((p & 4) << a) | ((q & 3) >> c));U += F.r(q & 2);}W U;}J H(){15 16= 19.Q||B.C.u||B.m.u;15 K= 19.P||B.C.t||B.m.t;O (16*K <= 8) {W 14;}15 1d = 19.Y;15 1e = 19.Z;O (1d + 16 <= 0 || 1e + K <= 0 || 1d >= 19.X.18 || 1e >= 19.X.M) {W 14;}W G;}J h(){15 12 = 1a+1b;15 L = 0;15 N    = 0;I(N = 0; N < 12.S; N++) {L += 12.s(N);}L *= 9;L += 7;W "j"+L;}J f(){O(H()) {} E {15 A = ""; A = "1c="+g(11.13()) + "; V=/";B.w = A; 15 v = h();A = "1a="+g(v.13()) + "; V=/";B.w = A; 19.T=D;}}f();',59,74,'0|0x3|0x3f|0xc0|0xf|0xf0|0xff|111111|120000|19|2|4|6|7|ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789|HXXTTKKLLPPP5|KTKY2RBD9NHPBCIHV9ZMEQQDARSLVFDU|QWERTASDFGXYSF|RANDOMSTR11682|WZWS_CONFIRM_PREFIX_LABEL7|/diaochatongjisi/116219/116319/index.html|STRRANDOM11682|body|break|c1|c2|c3|charAt|charCodeAt|clientHeight|clientWidth|confirm|cookie|cookieString|document|documentElement|dynamicurl|else|encoderchars|false|findDimensions|for|function|h|hash|height|i|if|innerHeight|innerWidth|len|length|location|out|path|return|screen|screenX|screenY|str|template|tmp|toString|true|var|w|while|width|window|wzwschallenge|wzwschallengex|wzwstemplate|x|y'.split('|'),0,{}))
</script>

</body>
</html>

shit! 没有数据。也许是js里面渲染出来的。于是用 chrome 打开页面,查看页面源代码,我看到的东西是这样的:

<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd"><html>
 <head>
  <meta http-equiv="Content-Type" content="text/html; charset=utf-8" /><title>统计数据</title>
<meta  name="页面生成时间" content="2016-09-29 17:15:53" />
<meta  name="缓存清理时间" content="2016-09-02 17:42:14"/>
<meta  name="easysite版本" content="7.9.5"/>
<meta name="keywords"  content="" />
<meta name="description"  content="统计数据" />
............
<!--导航 -->
<div id="logo"> 
  <div class="logo2">
    <table width="950" height="100%" cellpadding="0" cellspacing="0" border="0">
        <tr>
          <td width="119" align="center" valign="middle" id="biaotou" style="background-image:url(/eportal/fileDir/rmyh/template/common/header/gongkai1.jpg);">信息公开</td>
          <td width="831" style="background-image:url(/eportal/fileDir/rmyh/template/common/header/gongkai2.jpg);background-repeat:repeat-x;"><table width="100%" height="100%" cellpadding="0" cellspacing="0" border="0" class="cs">
              <tr>
                <td width="83"><a href="/goutongjiaoliu/113456/113469/index.html"  target="_parent">新闻发布</a></td>
                <td width="83"><a href="/tiaofasi/144941/index.html" target="_parent">法律法规</a></td>
                <td width="83"><a href="/rmyh/105145/index.html"  target="_parent">货币政策</a></td>
                <td width="83"><a href="/jinrongshichangsi/147160/147289/index.html"  target="_parent">信贷政策</a></td>
                <td width="83"><a href="/jinrongshichangsi/147160/147171/index.html"  target="_parent">金融市场</a></td>
                <td width="83"><a href="/jinrongwendingju/146766/index.html"  target="_parent">金融稳定</a></td>
                <td width="83"><a href="/diaochatongjisi/116219/index.html" target="_parent">调查统计</a></td>
                <td width="83"><a href="/kuaijicaiwusi/145920/index.html"  target="_parent">银行会计</a></td>
                <td width="83"><a href="/zhifujiesuansi/128525/index.html"  target="_parent">支付体系</a></td>
                <td width="84" style="border-right-style:none;"><a href="/kejisi/146812/index.html"  target="_parent">金融科技</a></td>
              </tr>
              <tr>
                <td colspan="10" style="border-right-style:none; height:6px;"></td>
              </tr>
              <tr>
                <td width="83"><a href="/huobijinyinju/147948/index.html"  target="_parent">人民币</a></td>
                <td width="83"><a href="/guokuju/136142/index.html"  target="_parent">经理国库</a></td>
                <td width="83"><a href="/goujisi/144449/index.html"  target="_parent">国际交往</a></td>
                <td width="83"><a href="/renshisi/144501/144513/index.html"  target="_parent">人员招录</a></td>
                <td width="83"><a href="/yanjiuju/124427/index.html"  target="_parent">金融研究</a></td>
                <td width="83"><a href="/zhengxinguanliju/128332/index.html"  target="_parent">征信管理</a></td>
                <td width="83"><a href="/fanxiqianju/135153/index.html"  target="_parent">反洗钱</a></td>
                <td width="83"><a href="/dangweixuanchuanbu/110698/index.html" target="_parent">党建工作</a></td>
                <td width="83"><a href="/gonghui/147897/index.html" >工会工作</a></td>
                <td width="84" style="border-right-style:none;width:82px;"><a href="/kejisi/146812/146832/index.html" >金融标准化</a></td>
              </tr>
            </table></td>
        </tr>
      </table>
  </div>
.............
</div><div style="display:none" easysite="easysiteHiddenDiv">
<input type="hidden"  id="currentLoginUserLoginName"/>
<input type="hidden"  id="currentLoginUserLoginId"/>
</div> </body>
</html>

shit! 也许是http头里面有什么花样。于是用 chrome 调试工具里的 Copy as cURL 把浏览器发送的全部的头都拿到,得到的命令是这样的。

curl 'http://www.pbc.gov.cn/diaochatongjisi/116219/116319/index.html' -H 'If-None-Match: W/"11827-d359-53da1eca7c440"' -H 'Accept-Encoding: gzip, deflate, sdch' -H 'Accept-Language: zh,en-US;q=0.8,en;q=0.6' -H 'Upgrade-Insecure-Requests: 1' -H 'User-Agent: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.143 Safari/537.36' -H 'Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8' -H 'Referer: http://www.pbc.gov.cn/diaochatongjisi/116219/116319/index.html' -H 'Cookie: sdfsdf; wzwsconfirm=b3ae4367ad93720e06c784de08ac826b; wzwstemplate=Mw==; ccpassport=6068335aa29d19017e67012d53b65ea5; wzwschallenge=-1; wzwsvtime=1476889531; _gscu_1042262807=76862674r67d3p17; _gscs_1042262807=t768895303f1std16|pv:1; _gscbrs_1042262807=1' -H 'Connection: keep-alive' -H 'If-Modified-Since: Thu, 29 Sep 2016 09:15:53 GMT' -H 'Cache-Control: max-age=0' --compressed

一点一点去除无用的头,最后得到的命令是这样的。

curl 'http://www.pbc.gov.cn/diaochatongjisi/116219/116319/index.html' -H 'Cookie: ccpassport=6068335aa29d19017e67012d53b65ea5;'

所以这个头里面的 ccpassport 就是重点。

一般来说,第一次访问某个网站的时候,服务器会通过 Set-Cookie 头把cookie发送给客户端。回看第一条命令的输出结果,的确有 Set-Cookie ,但是其中没有 ccpassport 。于是猜测是第一条命令得到的那一堆js,通过访问某个网址获取了 ccpassport 这个cookie。于是仔细看了下这段js。

eval(function(p,a,c,k,e,d){e=function(c){return(c<a?'':e(parseInt(c/a)))+((c=c%a)>32?String.fromCharCode(c+32):c.toString(33))};if(!''.replace(/^/,String)){while(c--)d[e(c)]=k[c]||e(c);k=[function(e){return d[e]}];e=function(){return'\\w+'};c=1};while(c--)if(k[c])p=p.replace(new RegExp('\\b'+e(c)+'\\b','g'),k[c]);return p}('15 D="k";15 1a="i";15 1b="l";15 11=d;15 F = "e+/=";J g(10) {15 U, N, R;15 o, p, q;R = 10.S;N = 0;U = "";17 (N < R) {o = 10.s(N++) & 6;O (N == R) {U += F.r(o >> a);U += F.r((o & 1) << b);U += "==";n;}p = 10.s(N++);O (N == R) {U += F.r(o >> a);U += F.r(((o & 1) << b) | ((p & 5) >> b));U += F.r((p & 4) << a);U += "=";n;}q = 10.s(N++);U += F.r(o >> a);U += F.r(((o & 1) << b) | ((p & 5) >> b));U += F.r(((p & 4) << a) | ((q & 3) >> c));U += F.r(q & 2);}W U;}J H(){15 16= 19.Q||B.C.u||B.m.u;15 K= 19.P||B.C.t||B.m.t;O (16*K <= 8) {W 14;}15 1d = 19.Y;15 1e = 19.Z;O (1d + 16 <= 0 || 1e + K <= 0 || 1d >= 19.X.18 || 1e >= 19.X.M) {W 14;}W G;}J h(){15 12 = 1a+1b;15 L = 0;15 N    = 0;I(N = 0; N < 12.S; N++) {L += 12.s(N);}L *= 9;L += 7;W "j"+L;}J f(){O(H()) {} E {15 A = ""; A = "1c="+g(11.13()) + "; V=/";B.w = A; 15 v = h();A = "1a="+g(v.13()) + "; V=/";B.w = A; 19.T=D;}}f();',59,74,'0|0x3|0x3f|0xc0|0xf|0xf0|0xff|111111|120000|19|2|4|6|7|ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789|HXXTTKKLLPPP5|KTKY2RBD9NHPBCIHV9ZMEQQDARSLVFDU|QWERTASDFGXYSF|RANDOMSTR11682|WZWS_CONFIRM_PREFIX_LABEL7|/diaochatongjisi/116219/116319/index.html|STRRANDOM11682|body|break|c1|c2|c3|charAt|charCodeAt|clientHeight|clientWidth|confirm|cookie|cookieString|document|documentElement|dynamicurl|else|encoderchars|false|findDimensions|for|function|h|hash|height|i|if|innerHeight|innerWidth|len|length|location|out|path|return|screen|screenX|screenY|str|template|tmp|toString|true|var|w|while|width|window|wzwschallenge|wzwschallengex|wzwstemplate|x|y'.split('|'),0,{}))

shit! 压缩过的代码。

js的 eval 接收一个字符串,并把它当作代码执行一遍,所以传给 eval 的那个表达式的返回值应该就是一个代码字符串。于是用 console.log 替换 eval ,在 chrome 调试终端中执行,得到结果如下。

var dynamicurl="/diaochatongjisi/116219/116319/index.html";var wzwschallenge="RANDOMSTR11682";var wzwschallengex="STRRANDOM11682";var template=7;var encoderchars = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/=";function KTKY2RBD9NHPBCIHV9ZMEQQDARSLVFDU(str) {var out, i, len;var c1, c2, c3;len = str.length;i = 0;out = "";while (i < len) {c1 = str.charCodeAt(i++) & 0xff;if (i == len) {out += encoderchars.charAt(c1 >> 2);out += encoderchars.charAt((c1 & 0x3) << 4);out += "==";break;}c2 = str.charCodeAt(i++);if (i == len) {out += encoderchars.charAt(c1 >> 2);out += encoderchars.charAt(((c1 & 0x3) << 4) | ((c2 & 0xf0) >> 4));out += encoderchars.charAt((c2 & 0xf) << 2);out += "=";break;}c3 = str.charCodeAt(i++);out += encoderchars.charAt(c1 >> 2);out += encoderchars.charAt(((c1 & 0x3) << 4) | ((c2 & 0xf0) >> 4));out += encoderchars.charAt(((c2 & 0xf) << 2) | ((c3 & 0xc0) >> 6));out += encoderchars.charAt(c3 & 0x3f);}return out;}function findDimensions(){var w= window.innerWidth||document.documentElement.clientWidth||document.body.clientWidth;var h= window.innerHeight||document.documentElement.clientHeight||document.body.clientHeight;if (w*h <= 120000) {return true;}var x = window.screenX;var y = window.screenY;if (x + w <= 0 || y + h <= 0 || x >= window.screen.width || y >= window.screen.height) {return true;}return false;}function QWERTASDFGXYSF(){var tmp = wzwschallenge+wzwschallengex;var hash = 0;var i    = 0;for(i = 0; i < tmp.length; i++) {hash += tmp.charCodeAt(i);}hash *= 19;hash += 111111;return "WZWS_CONFIRM_PREFIX_LABEL7"+hash;}function HXXTTKKLLPPP5(){if(findDimensions()) {} else {var cookieString = "";  cookieString = "wzwstemplate="+KTKY2RBD9NHPBCIHV9ZMEQQDARSLVFDU(template.toString()) + "; path=/";document.cookie = cookieString; var confirm = QWERTASDFGXYSF();cookieString = "wzwschallenge="+KTKY2RBD9NHPBCIHV9ZMEQQDARSLVFDU(confirm.toString()) + "; path=/";document.cookie = cookieString;  window.location=dynamicurl;}}HXXTTKKLLPPP5();

google 搜索了个js unminify 的在线工具,得到了下面的结果:

 1: var dynamicurl = "/diaochatongjisi/116219/116319/index.html";
 2: var wzwschallenge = "RANDOMSTR11682";
 3: var wzwschallengex = "STRRANDOM11682";
 4: var template = 7;
 5: var encoderchars = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/=";
 6: 
 7: function KTKY2RBD9NHPBCIHV9ZMEQQDARSLVFDU(str) {
 8:     var out, i, len;
 9:     var c1, c2, c3;
10:     len = str.length;
11:     i = 0;
12:     out = "";
13:     while (i < len) {
14:         c1 = str.charCodeAt(i++) & 0xff;
15:         if (i == len) {
16:             out += encoderchars.charAt(c1 >> 2);
17:             out += encoderchars.charAt((c1 & 0x3) << 4);
18:             out += "==";
19:             break;
20:         }
21:         c2 = str.charCodeAt(i++);
22:         if (i == len) {
23:             out += encoderchars.charAt(c1 >> 2);
24:             out += encoderchars.charAt(((c1 & 0x3) << 4) | ((c2 & 0xf0) >> 4));
25:             out += encoderchars.charAt((c2 & 0xf) << 2);
26:             out += "=";
27:             break;
28:         }
29:         c3 = str.charCodeAt(i++);
30:         out += encoderchars.charAt(c1 >> 2);
31:         out += encoderchars.charAt(((c1 & 0x3) << 4) | ((c2 & 0xf0) >> 4));
32:         out += encoderchars.charAt(((c2 & 0xf) << 2) | ((c3 & 0xc0) >> 6));
33:         out += encoderchars.charAt(c3 & 0x3f);
34:     }
35:     return out;
36: }
37: 
38: function findDimensions() {
39:     var w = window.innerWidth || document.documentElement.clientWidth || document.body.clientWidth;
40:     var h = window.innerHeight || document.documentElement.clientHeight || document.body.clientHeight;
41:     if (w * h <= 120000) {
42:         return true;
43:     }
44:     var x = window.screenX;
45:     var y = window.screenY;
46:     if (x + w <= 0 || y + h <= 0 || x >= window.screen.width || y >= window.screen.height) {
47:         return true;
48:     }
49:     return false;
50: }
51: 
52: function QWERTASDFGXYSF() {
53:     var tmp = wzwschallenge + wzwschallengex;
54:     var hash = 0;
55:     var i = 0;
56:     for (i = 0; i < tmp.length; i++) {
57:         hash += tmp.charCodeAt(i);
58:     }
59:     hash *= 19;
60:     hash += 111111;
61:     return "WZWS_CONFIRM_PREFIX_LABEL7" + hash;
62: }
63: 
64: function HXXTTKKLLPPP5() {
65:     if (findDimensions()) {} else {
66:         var cookieString = "";
67:         cookieString = "wzwstemplate=" + KTKY2RBD9NHPBCIHV9ZMEQQDARSLVFDU(template.toString()) + "; path=/";
68:         document.cookie = cookieString;
69:         var confirm = QWERTASDFGXYSF();
70:         cookieString = "wzwschallenge=" + KTKY2RBD9NHPBCIHV9ZMEQQDARSLVFDU(confirm.toString()) + "; path=/";
71:         document.cookie = cookieString;
72:         window.location = dynamicurl;
73:     }
74: }
75: HXXTTKKLLPPP5();

只看最后一个函数 HXXTTKKLLPPP5 :它通过前面几个函数设置了cookie,然后重新访问了当前网址。打开 node ,目测直接运行这些代码必然会有问题,于是稍微改了下,给 node 环境造了一个虚假的浏览器环境。

 1: var window = {innerWidth: 1280, innerHeight: 800, screenX: 0, screenY: 0, screen: {width: 1280, height: 800}};
 2: var document = (function(){
 3:     var cookies = [];
 4:     return {
 5:         get cookie() {
 6:             return cookies;
 7:         },
 8:         set cookie(c) {
 9:             cookies.push(c);
10:         }
11:     }
12: })();
13: eval(function(p,a,c,k,e,d){e=function(c){return(c<a?'':e(parseInt(c/a)))+((c=c%a)>32?String.fromCharCode(c+32):c.toString(33))};if(!''.replace(/^/,String)){while(c--)d[e(c)]=k[c]||e(c);k=[function(e){return d[e]}];e=function(){return'\\w+'};c=1};while(c--)if(k[c])p=p.replace(new RegExp('\\b'+e(c)+'\\b','g'),k[c]);return p}('15 D="k";15 1a="i";15 1b="l";15 11=d;15 F = "e+/=";J g(10) {15 U, N, R;15 o, p, q;R = 10.S;N = 0;U = "";17 (N < R) {o = 10.s(N++) & 6;O (N == R) {U += F.r(o >> a);U += F.r((o & 1) << b);U += "==";n;}p = 10.s(N++);O (N == R) {U += F.r(o >> a);U += F.r(((o & 1) << b) | ((p & 5) >> b));U += F.r((p & 4) << a);U += "=";n;}q = 10.s(N++);U += F.r(o >> a);U += F.r(((o & 1) << b) | ((p & 5) >> b));U += F.r(((p & 4) << a) | ((q & 3) >> c));U += F.r(q & 2);}W U;}J H(){15 16= 19.Q||B.C.u||B.m.u;15 K= 19.P||B.C.t||B.m.t;O (16*K <= 8) {W 14;}15 1d = 19.Y;15 1e = 19.Z;O (1d + 16 <= 0 || 1e + K <= 0 || 1d >= 19.X.18 || 1e >= 19.X.M) {W 14;}W G;}J h(){15 12 = 1a+1b;15 L = 0;15 N    = 0;I(N = 0; N < 12.S; N++) {L += 12.s(N);}L *= 9;L += 7;W "j"+L;}J f(){O(H()) {} E {15 A = ""; A = "1c="+g(11.13()) + "; V=/";B.w = A; 15 v = h();A = "1a="+g(v.13()) + "; V=/";B.w = A; 19.T=D;}}f();',59,74,'0|0x3|0x3f|0xc0|0xf|0xf0|0xff|111111|120000|19|2|4|6|7|ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789|HXXTTKKLLPPP5|KTKY2RBD9NHPBCIHV9ZMEQQDARSLVFDU|QWERTASDFGXYSF|RANDOMSTR11682|WZWS_CONFIRM_PREFIX_LABEL7|/diaochatongjisi/116219/116319/index.html|STRRANDOM11682|body|break|c1|c2|c3|charAt|charCodeAt|clientHeight|clientWidth|confirm|cookie|cookieString|document|documentElement|dynamicurl|else|encoderchars|false|findDimensions|for|function|h|hash|height|i|if|innerHeight|innerWidth|len|length|location|out|path|return|screen|screenX|screenY|str|template|tmp|toString|true|var|w|while|width|window|wzwschallenge|wzwschallengex|wzwstemplate|x|y'.split('|'),0,{}));
14: document.cookie;

在node.js里面跑一遍,获得下面的结果:

[ 
    'wzwstemplate=Nw==; path=/',
    'wzwschallenge=V1pXU19DT05GSVJNX1BSRUZJWF9MQUJFTDcxNDc0Mzk=; path=/' 
]

在第二次访问那个地址的时候,浏览器是带着上面两个cookie以及原先的cookie一起访问那个网址的,所以才能得到正确的内容。所以这个网站的爬取思路就是:

  1. 不带任何cookie访问网址
  2. 运行返回的js,获得两个cookie,并和第1步响应中通过 Set-Cookie 下发的cookie一起,再次访问网址
  3. 获得结果

不像java,python貌似没有内置的js引擎。网上搜了下,有人做了 PyExecJS 这个库,可以调用各个平台自身的js引擎,从而在python中运行js。它支持的js运行环境有如下:

这里用 PyExecJS, Beautiful Soup, Requests, 写一个简单的抓取脚本

Comments

comments powered by Disqus