import random
import time
from random import choice

import requests
from lxml import etree
def get_proxy(max_attempts=5, check_timeout=3):
    """Fetch a proxy from the kuaidaili API and verify it can reach 58.com.

    Each attempt requests one proxy address from the API, then issues a test
    request to the 58.com rental search page through it. The proxy is
    accepted only if that test request answers with HTTP 200.

    Args:
        max_attempts: How many proxies to try before giving up.
        check_timeout: Timeout in seconds for the validation request, so a
            dead proxy cannot hang the caller.

    Returns:
        A ``{'http': 'http://<address>'}`` dict usable as the ``proxies``
        argument of ``requests``, or ``None`` if no proxy passed validation
        within ``max_attempts`` tries. Note that passing ``None`` as
        ``proxies`` makes ``requests`` connect without this proxy.
    """
    url = 'http://svip.kuaidaili.com/api/getproxy/?orderid=&num=1&b_pcchrome=1&b_pcie=1&b_pcff=1&protocol=1&method=2&an_an=1&an_ha=1&quality=2&sep=1'
    check_url = 'http://nj.58.com/chuzu/?key=%E7%A7%9F%E6%88%BF'
    for _ in range(max_attempts):
        try:
            # assumes the API body is a single "host:port" line — TODO confirm
            proxy_temp = requests.get(url=url, timeout=1).text.strip()
            if not proxy_temp:
                continue
            proxy = {'http': 'http://{}'.format(proxy_temp)}
            status = requests.get(
                url=check_url, proxies=proxy, timeout=check_timeout
            ).status_code
        except requests.exceptions.RequestException as e:
            # A dead or slow proxy (ProxyError, timeout, ...) is expected here;
            # report it and move on to the next candidate.
            print(e)
            continue
        if status == 200:
            return proxy
    return None
# Pool of browser User-Agent headers; one is picked at random for each
# proxied request.
_USER_AGENT_HEADERS = [
    {'User-Agent':'Mozilla/5.0(Macintosh;U;IntelMacOSX10_6_8;en-us)AppleWebKit/534.50(KHTML,likeGecko)Version/5.1Safari/534.50'},
    {'User-Agent':'Mozilla/5.0(Windows;U;WindowsNT6.1;en-us)AppleWebKit/534.50(KHTML,likeGecko)Version/5.1Safari/534.50'},
    {'User-Agent':'Mozilla/5.0(compatible;MSIE9.0;WindowsNT6.1;Trident/5.0;'},
    {'User-Agent':'Mozilla/5.0(WindowsNT6.1;rv:2.0.1)Gecko/20100101Firefox/4.0.1'},
    {'User-Agent':'Opera/9.80(WindowsNT6.1;U;en)Presto/2.8.131Version/11.11'},
    {'User-Agent':'Mozilla/4.0(compatible;MSIE7.0;WindowsNT5.1;Maxthon2.0)'},
    {'User-Agent':'Mozilla/4.0(compatible;MSIE7.0;WindowsNT5.1;360SE)'},
    {'User-Agent':'Mozilla/5.0(Macintosh;IntelMacOSX10_7_0)AppleWebKit/535.11(KHTML,likeGecko)Chrome/17.0.963.56Safari/535.11'},
]


def _fetch_text(url, attempts=3):
    """GET ``url`` through a fresh proxy and random User-Agent; None on failure."""
    for _ in range(attempts):
        try:
            # A proxy can stop working between its validation and this
            # request, so every attempt asks for a new one.
            return requests.get(
                url=url,
                proxies=get_proxy(),
                timeout=1,
                headers=random.choice(_USER_AGENT_HEADERS),
            ).text
        except requests.exceptions.RequestException as e:
            print(e)
    return None


def _to_tree(text):
    """Parse HTML text with lxml; return None when no tree could be built."""
    try:
        return etree.HTML(text)
    except (ValueError, etree.LxmlError) as e:
        print(e)
        return None


def _bracketed(text):
    """Return the text between the first '[' and the next ']', or ''."""
    _, opened, rest = text.partition('[')
    inner, closed, _ = rest.partition(']')
    return inner.strip() if opened and closed else ''


def _parse_brand_apartment(html):
    """Extract the listing fields from a "pinpaigongyu" detail page."""
    title = html.xpath('//div[@class="housedetail center cf"]/h2/text()')[0]
    return {
        'phone': str(html.xpath('//div[@class="phonenum getPrivateCallBtnStyle"]/text()')),
        # NOTE(review): the previous chained-split expression could never
        # succeed (it split on ' [' after all whitespace had been removed).
        # This assumes the title carries the rental type in square brackets —
        # confirm against a live page.
        'rent_type': _bracketed(title),
        'area': html.xpath('//ul[@class="house-info-list"]/li[1]/span/text()')[0].split()[0] + "平",
        'room_type': html.xpath('//ul[@class="house-info-list"]/li[2]/span/text()')[0].split()[0],
        'address': html.xpath('//ul[@class="house-info-list"]/li[4]/span/text()')[0].strip(),
        'traffic': str(html.xpath('//ul[@class="house-info-list"]/li[5]/span/text()')),
        'pictures': html.xpath('//ul[@id="pic-list"]/li/img/@lazy_src'),
        'house_description': html.xpath('//p[@id="desc"]/text()')[0].replace(' ', ''),
    }


def _parse_regular_listing(html):
    """Extract the listing fields from an ordinary (non-brand) detail page."""
    # Room type and area are read from the same text node: first and second
    # whitespace-separated token respectively.
    room_and_area = html.xpath('//ul[@class="f14"]/li[2]/span[2]/text()')[0].split()
    return {
        'phone': str(html.xpath('//div[@class="house-chat-phonenum"]/p[@class="phone-num"]/text()')),
        'rent_type': html.xpath('//ul[@class="f14"]/li[1]/span[2]/text()')[0].split('-')[0],
        'area': room_and_area[1] + "平",
        'room_type': room_and_area[0],
        'address': html.xpath('//ul[@class="f14"]/li[6]/span[2]/text()')[0].strip(),
        'traffic': str(html.xpath('//ul[@class="f14"]/li[5]/em/text()')),
        'pictures': html.xpath('//ul[@id="housePicList"]/li/img/@lazy_src'),
        'house_description': str(html.xpath('//ul[@class="introduce-item"]/li[2]/span[@class="a2"]//text()')).strip(),
    }


def crawl():
    """Walk the nj.58.com rental listing pages and print each listing's phone.

    The first listing page is fetched directly to read the page count from
    the pager. Every listing page and every detail page is then fetched
    through a proxy from ``get_proxy()`` with a random User-Agent. A page or
    listing that cannot be fetched or parsed is reported and skipped instead
    of aborting the whole crawl.
    """
    first_url = 'http://nj.58.com/chuzu'
    try:
        resp = requests.get(url=first_url, timeout=0.5).text
    except requests.exceptions.RequestException as e:
        print(e)
        return  # without the index page there is no page count to iterate

    index_tree = _to_tree(resp)
    pager_labels = (
        index_tree.xpath('//div[@class="pager"]/a/span/text()')
        if index_tree is not None else []
    )
    try:
        # The second-to-last pager label is used as the last page number.
        max_page = int(pager_labels[-2])
    except (IndexError, ValueError):
        print('could not determine the page count from {}'.format(first_url))
        return

    for page in range(1, max_page + 1):
        listing_text = _fetch_text(first_url + "/pn" + str(page))
        if listing_text is None:
            continue
        listing_tree = _to_tree(listing_text)
        if listing_tree is None:
            continue
        detail_urls = listing_tree.xpath('//ul[@class="listUl"]/li/div[@class="img_list"]/a/@href')
        for detail_url in detail_urls:
            # Random pause of up to three seconds between detail requests.
            time.sleep(random.random() * 3)
            detail_text = _fetch_text(detail_url)
            if detail_text is None:
                continue
            html = _to_tree(detail_text)
            if html is None:
                continue
            if "pinpaigongyu" in detail_url:
                parse = _parse_brand_apartment
            else:
                parse = _parse_regular_listing
            try:
                record = parse(html)
            except IndexError as e:
                # An expected node is missing from this page; skip the listing.
                print('skipping {}: {}'.format(detail_url, e))
                continue
            print(record['phone'])
# Run the crawler only when this file is executed as a script, not on import.
if __name__ == '__main__':
    crawl()
1
U87 OP 还是出现 requests.exceptions.ProxyError,难道是在验证代理和爬取目标网站之间的这段时间里 ip 失效了?
|