# coding=utf-8
"""Collect the real target URLs behind Baidu search-result redirect links.

For every keyword listed in ``key_word.txt`` (one per line, GBK-encoded) the
script requests the Baidu result page, pulls out the
``http://www.baidu.com/link?url=...`` redirect links, follows each of them
and appends the final URL to ``data.txt`` (also echoing it to stdout).
"""
import re
import sys
import time  # not used by this script itself; kept from the original import list
from contextlib import closing

try:  # Python 2, which the script was originally written for
    from urllib import quote
    from urllib2 import Request, URLError, urlopen
except ImportError:  # Python 3 moved the same names into the urllib package
    from urllib.error import URLError
    from urllib.parse import quote
    from urllib.request import Request, urlopen

KEYWORD_FILE = 'key_word.txt'
OUTPUT_FILE = 'data.txt'
KEYWORD_ENCODING = 'gbk'
SEARCH_URL = 'http://www.baidu.com/s?wd={0}&pn={1}'

# The literal "?" and "." must be escaped: unescaped, "link?url=" means
# "lin" + optional "k" + "url=" and never matches a real redirect link.
LINK_PATTERN = re.compile(
    r'href\s*=\s*"(https?://www\.baidu\.com/link\?url=.+?)"')


def build_search_url(keyword, pn=0):
    """Return the Baidu search URL for *keyword*.

    keyword -- the search term as text (unicode); it is UTF-8 encoded and
               percent-quoted so that spaces, "&" or non-ASCII characters
               cannot corrupt the query string.
    pn      -- value of the ``pn`` query parameter (the original script
               always sent 0).
    """
    return SEARCH_URL.format(quote(keyword.encode('utf-8')), pn)


def extract_result_links(html):
    """Return every Baidu redirect link found in *html*, in page order."""
    return LINK_PATTERN.findall(html)


def fetch_result_links(keyword, pn=0):
    """Download the result page for *keyword* and return its redirect links."""
    request = Request(build_search_url(keyword, pn))
    with closing(urlopen(request)) as response:
        # Decode once at the I/O boundary so the text pattern works on both
        # Python 2 and 3; undecodable bytes cannot occur inside the ASCII
        # links we are looking for, so "replace" is safe here.
        html = response.read().decode('utf-8', 'replace')
    return extract_result_links(html)


def resolve_redirect(link):
    """Follow *link* and return the URL the request finally ended up at."""
    with closing(urlopen(Request(link))) as response:
        return response.geturl()


def main():
    """Resolve the result links of every keyword and append them to the output."""
    # The keyword file is read as bytes and decoded explicitly, which behaves
    # the same on Python 2 and 3.
    with open(OUTPUT_FILE, 'a') as out, open(KEYWORD_FILE, 'rb') as keywords:
        for raw_line in keywords:
            # strip() removes the trailing newline that would otherwise be
            # sent to Baidu as part of the search term.
            keyword = raw_line.decode(KEYWORD_ENCODING).strip()
            if not keyword:
                continue
            try:
                links = fetch_result_links(keyword)
            except URLError as err:
                sys.stderr.write('search failed: %s\n' % err)
                continue
            for link in links:
                try:
                    final_url = resolve_redirect(link)
                except URLError as err:
                    # One dead result must not abort the whole batch.
                    sys.stderr.write('skipped %s: %s\n' % (link, err))
                    continue
                print(final_url)
                out.write(final_url + '\n')


if __name__ == '__main__':
    main()
1
cyrbuzz 2017-12-06 19:04:10 +08:00
排版感人。
|
2
shawndev 2017-12-07 11:11:24 +08:00
selenium
|
3
shawndev 2017-12-07 11:12:19 +08:00
pn=0 中的 pn 即 page number(页码参数)。
|