yanran8385
#!/usr/bin/env python # -*- coding: utf-8 -*- # @Author: Administrator # @Date: 2015-10-31 15:45:27 # @Last Modified by: Administrator # @Last Modified time: 2015-11-23 16:57:31 import requests import sys import json import re reload(sys) ('utf-8') #获取到匹配字符的字符串 def find(pattern,test): finder = (pattern, test) start = () end = () return test[start:end-1] cookies = { '_ga':'', '_za':'8d570b05-b0b1-4c96-a441-faddff34', 'q_c1':'23ddd234234', '_xsrf':'234id':'"ZTE3NWY2ZTsdfsdfsdfWM2YzYxZmE=|1446435757|15fef3b84e044c122ee0fe8959e606827d333134"', 'z_c0':'"QUFBQXhWNGZsdfsdRvWGxaeVRDMDRRVDJmSzJFN1JLVUJUT1VYaEtZYS13PT0=|14464e234767|57db366f67cc107a05f1dc8237af24b865573cbe5"', '__utmt':'1', '__utma':'', '__utmb':'', '__utmc':'51123390', '__utmz':'|utmcgcn=(referral)|utmcmd=referral|utmcct=/', '__utmv':'|2=registration_date=2028=1^3=entry_date=201330318=1'} headers = {'user-agent': 'Mozilla/ (Windows NT ; WOW64) AppleWebKit/ (KHTML, like Gecko) Chrome/ Safari/', 'referer':'', 'host':'','Origin':'', 'Content-Type':'application/x-www-form-urlencoded; charset=UTF-8', 'Connection':'keep-alive','X-Requested-With':'XMLHttpRequest','Content-Length':'81', 'Accept-Encoding':'gzip,deflate','Accept-Language':'zh-CN,zh;q=','Connection':'keep-alive' } #多次访问之后,其实一加载时加载20个问题,具体参数传输就是offset,以20递增 dicc = {"offset":60} n=20 b=0 # 与爬取图片相同的是,往下拉的时候也会发送http请求返回json数据,但是不同的是,像模拟登录首页不同的是除了 # 发送form表单的那些东西后,知乎是拒绝了我的请求了,刚开始以为是headers上的拦截,往headers添加浏览器 # 访问是的headers那些信息添加上,发现还是拒绝访问。 #想了一下,应该是cookie原因。这个加载的请求和模拟登录首页不同 #所以补上其他的cookies信息,再次请求,请求成功。 for x in xrange(20,460,20): n = n+20 b = b+20 dicc['offset'] = x formdata = {'method':'next','params':'{"offset":20}','_xsrf':'20770d88051f0f45e941570645f5e2e6'} #传输需要json串,和python的字典是有区别的,需要转换 formdata['params'] = (dicc) # print (dicc) # print dicc circle = ("", cookies=cookies,data=formdata,headers=headers) #response内容 其实爬过一次之后就大同小异了。 都是 #问题返回的json串格式 # {"r":0, # "msg": ["