def parse_html(html):
    """Extract movie titles and the next-page URL from one listing page.

    Args:
        html: Markup of a single listing page.

    Returns:
        A ``(movie_name_list, next_url)`` tuple. ``movie_name_list`` holds
        the text of the first ``<span class="title">`` found under each
        ``<li>`` of ``<ol class="grid_view">``. ``next_url`` is
        ``DOWNLOAD_URL`` concatenated with the ``href`` of the link inside
        ``<span class="next">``, or ``None`` when no such link exists.
        This function only extracts the link; fetching the next page is
        left to the caller.
    """
    soup = BeautifulSoup(html, 'lxml')
    movie_name_list = []

    # find() returns None when nothing matches, so every lookup is checked
    # before it is dereferenced instead of chaining calls blindly.
    movie_list_soup = soup.find('ol', attrs={'class': 'grid_view'})
    if movie_list_soup is not None:
        for movie_li in movie_list_soup.find_all('li'):
            detail = movie_li.find('div', attrs={'class': 'hd'})
            if detail is None:
                continue
            title = detail.find('span', attrs={'class': 'title'})
            if title is None:
                continue
            movie_name_list.append(title.getText())
    # Debug output kept from the original snippet.
    print(movie_name_list)

    # A page may lack the "next" span entirely, or have the span without an
    # <a> inside it; both cases mean there is no further page to report.
    next_span = soup.find('span', attrs={'class': 'next'})
    next_page = next_span.find('a') if next_span is not None else None
    if next_page:
        return movie_name_list, DOWNLOAD_URL + next_page['href']
    return movie_name_list, None
1
sean10 2017-12-22 23:38:09 +08:00 via iPhone
倒数第二行不是返回了下一页的 url 么,在其他函数里有写继续爬的逻辑吧
|
2
lihongjie0209 2017-12-22 23:55:52 +08:00
if next_page:
    # 返回一个 tuple(movie_name_list, next_page_url)
    return movie_name_list, DOWNLOAD_URL + next_page['href']
return movie_name_list, None
这个应该是链接提取函数, 如果有下一页就把下一页的链接作为 tuple 的第二项返回. |
3
andmspy OP |
4
lihongjie0209 2017-12-24 21:58:07 +08:00
@andmspy 这个是链接提取函数, 不是下载函数, 用这个函数提取链接之后再下载
|
5
andmspy OP @lihongjie0209
因为‘ DOWNLOAD_URL + next_page['href'] ’ 这个就是下一页的链接所以在 def 函数里面,return 就是可以实现链接下一页,是这个意思么? |