@alexapollo @YUX @likuku @geek123 @EchoUtopia @Jblue In the end I gave up on converting this crawler to multithreading/multiprocessing; I still haven't figured it out, so I plan to read more of the source code you all posted.
I later revised the crawler once more, eliminating one full round of HTML parsing and thus shaving some time off the crawl:

1. fetchBooks(u'爬虫')
2. exportCsv(bookUrls)

When parsing a tag listing page, the book detail links and the next-page link are now saved in one pass; the previous version called urlopen twice just to get both, which wasted time. The book detail links are accumulated in a global variable, and the next-page links are followed one page at a time.
# -*- coding: UTF-8 -*-
import os
import re
import time
import json
import random
import urlparse
import unicodecsv as csv
from urllib2 import urlopen
from urllib2 import HTTPError
from bs4 import BeautifulSoup
import logging
logging.basicConfig(filename='douban.log', level=logging.DEBUG)
bookUrls = set()  # book detail URLs collected so far, shared across pages
def fetchBooks(start):
    '''Follow pagination links one page at a time, collecting the URLs of every book under the tag.'''
    page = u'https://book.douban.com/tag/' + start
    while page:
        nextPage = findPages(page)
        print 'Scraping books on page {!r} done'.format(page)
        logging.info('Scraping books on page {!r} done'.format(page))
        time.sleep(random.randint(1, 10))  # random delay to avoid hammering the site
        page = nextPage
def exportCsv(books):
    '''Write the detailed info of every book to a csv file.'''
    data = (download(book) for book in books)
    with open(os.path.join(os.path.dirname(__file__), 'books.csv'), 'wb') as f:
        writer = csv.writer(f)
        headers = (u'书名', u'原书名', u'出版日期', u'页数',
                   u'豆瓣评分', u'评价人数', u'ISBN', u'网址', u'TOP 评论')
        writer.writerow(headers)
        for line in data:
            writer.writerow(line)
            print 'Saving the book {} done'.format(line[6])  # line[6] is the ISBN
            logging.info('Saving the book {} done'.format(line[6]))
            time.sleep(random.randint(1, 10))
    print 'Saving ALL done'
    logging.info('Saving ALL done')
def findPages(pageUrl):
    '''Parse one Douban listing page: collect its books, then return the "next page" link (one per page).'''
    html = urlopen(iriToUri(pageUrl))
    bsObj = BeautifulSoup(html, 'html.parser')
    findBooks(bsObj)  # collect books on this page even when there is no next link
    linkEle = bsObj.find('link', {'rel': 'next'})
    if linkEle is not None and 'href' in linkEle.attrs:
        return u'https://book.douban.com' + linkEle.attrs['href']
    return None
def findBooks(bsObj):
    '''Parse one Douban listing page and collect the book detail links (20 per page).'''
    global bookUrls
    books = bsObj.findAll('a', {'class': 'nbg'})  # returns a (possibly empty) list, never None
    try:
        for book in books:
            if 'href' in book.attrs and book.attrs['href'] not in bookUrls:
                print 'Found new book: {}'.format(book.attrs['href'])
                logging.info('Found new book: {}'.format(book.attrs['href']))
                bookUrls.add(book.attrs['href'])
        return bookUrls
    except Exception as e:
        print e
        logging.exception('{}'.format(e))
def urlEncodeNonAscii(b):
    """Percent-encode non-ASCII bytes so the URL is pure ASCII."""
    return re.sub('[\x80-\xFF]', lambda c: '%%%02x' % ord(c.group(0)), b)
def iriToUri(iri):
    """Convert an IRI (e.g. a URL containing Chinese) into an ASCII-only URI so urlopen accepts it."""
    parts = urlparse.urlparse(iri)
    return urlparse.urlunparse(
        # the netloc (index 1) gets IDNA-encoded; every other part is percent-encoded
        part.encode('idna') if parti == 1 else urlEncodeNonAscii(part.encode('utf-8'))
        for parti, part in enumerate(parts)
    )
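# Illustrative example (values worked out by hand, not from the original post):
#   iriToUri(u'https://book.douban.com/tag/股票')
#   -> 'https://book.douban.com/tag/%e8%82%a1%e7%a5%a8'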
def getFullReview(reviewId):
    '''Fetch the full review text through the ajax endpoint.'''
    url = 'https://book.douban.com/j/review/' + str(reviewId) + '/fullinfo'
    try:
        html = json.loads(urlopen(url).read())['html']
    except HTTPError as e:
        print e
        logging.error('Error: {}'.format(e))
        return None
    match = re.search('.*(?=<div)', html)  # keep only the text before the trailing <div> block
    if match is not None:
        return match.group()
    return None
def download(bookUrl):
    '''Parse one book detail page into a row of fields.'''
    html = urlopen(bookUrl)
    bsObj = BeautifulSoup(html, 'html.parser')
    try:
        isbn = bsObj.find(id='info').find(
            text=re.compile(r'(\d{10})|(\d{13})')).strip()
    except AttributeError as e:
        print e
        logging.exception('{}'.format(e))
        isbn = ''
    try:
        publishY = bsObj.find(id='info').find(
            text=re.compile(r'\d{4}-\d{1,2}(-\d{1,2})?')).strip()
    except AttributeError as e:
        print e
        logging.exception('{}'.format(e))
        publishY = ''
    try:
        pageNum = bsObj.find(id='info').find(
            text=re.compile(r'^\s\d{3,4}$')).strip()
    except AttributeError as e:
        print e
        logging.exception('{}'.format(e))
        pageNum = ''
    try:
        origName = bsObj.find(id='info').find(text=u'原作名:')
        origName = origName.parent.next_sibling.strip() if origName is not None else ''
    except AttributeError as e:
        print e
        logging.exception('{}'.format(e))
        origName = ''
    try:
        rating = bsObj.find(
            'strong', {'class': 'll rating_num '}).get_text().strip()
    except AttributeError as e:
        print e
        logging.exception('{}'.format(e))
        rating = ''
    try:
        numRating = bsObj.find(
            'span', {'property': 'v:votes'}).get_text()
    except AttributeError as e:
        print e
        logging.exception('{}'.format(e))
        numRating = ''
    try:
        # the review container id looks like 'tb-<reviewId>'; strip the 'tb-' prefix
        reviewId = bsObj.find(
            'div', {'id': re.compile(r'tb-(\d+)')}).attrs['id'][3:]
        review = getFullReview(reviewId)
    except AttributeError as e:
        print e
        logging.exception('{}'.format(e))
        review = ''
    title = bsObj.find('span', {'property': 'v:itemreviewed'}).get_text()
    return (title, origName, publishY, pageNum, rating,
            numRating, isbn, bookUrl, review)
if __name__ == '__main__':
    print 'Starting at: {}'.format(time.ctime())
    logging.info('Starting at: {}'.format(time.ctime()))
    fetchBooks(u'股票')
    exportCsv(bookUrls)
    print 'All finished at: {}'.format(time.ctime())
    logging.info('All finished at: {}'.format(time.ctime()))
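On the multithreading idea I gave up on: here is a minimal, untested sketch of how the download step alone could be pushed through a thread pool (multiprocessing.dummy.Pool from the standard library), reusing download() and bookUrls from above. exportCsvThreaded, the pool size of 4, and the output filename are all made up for illustration, and it drops the per-book random sleep, so Douban could well rate-limit it:

# Minimal sketch (untested): parallelize only the download step with a
# thread pool; fetching the listing pages stays sequential.
from multiprocessing.dummy import Pool  # thread pool with the Pool API

def exportCsvThreaded(books, poolSize=4):  # hypothetical helper; pool size is a guess
    pool = Pool(poolSize)
    try:
        rows = pool.map(download, list(books))  # blocks until every detail page is parsed
    finally:
        pool.close()
        pool.join()
    with open('books_threaded.csv', 'wb') as f:  # illustrative filename
        writer = csv.writer(f)
        writer.writerow((u'书名', u'原书名', u'出版日期', u'页数',
                         u'豆瓣评分', u'评价人数', u'ISBN', u'网址', u'TOP 评论'))
        writer.writerows(rows)

It would be called as exportCsvThreaded(bookUrls) in place of exportCsv(bookUrls) after fetchBooks() finishes.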