# 代码仅作纪念,所有权利保留喔。
# 实际使用时还需要搭配 calibre 转换格式,不能使用 Email 直接发送 html 文件。
#!/usr/bin/env python3
# coding = utf-8
import requests, sys, time, logging, tempfile
from bs4 import BeautifulSoup
# Debug switch; it is not referenced by any code visible in this file.
DEBUG = False
class URLSet:
    """URL library: obtains, processes and provides article URLs.

    Every URL here is a hard-coded cn.nytimes.com link; nothing is fetched.
    """

    def getAnyOnePost(self):
        """Return the URL of one fixed sample article."""
        return 'https://cn.nytimes.com/china/20190118/china-population-crisis/'

    def getDualPostURL(self, url):
        """Return the bilingual ("dual") page URL for the article at ``url``.

        The dual URL is formed the way the hard-coded links in this class
        are: the article URL followed by ``dual/``.

        Args:
            url: Article URL, with or without a trailing slash. A URL that
                already ends in ``/dual/`` is returned unchanged.

        Returns:
            The dual-page URL. When ``url`` is ``None`` or blank, the dual
            URL of the sample article is returned, which is what this method
            returned for every input before it honoured its argument.
        """
        url = url.strip() if url else ''
        if not url:
            return 'https://cn.nytimes.com/china/20190118/china-population-crisis/dual/'
        if not url.endswith('/'):
            url += '/'
        if url.endswith('/dual/'):
            return url
        return url + 'dual/'

    @staticmethod
    def getTestURL():
        """Return one fixed dual-page URL for testing."""
        return 'https://cn.nytimes.com/asia-pacific/20190117/china-canada-schellenberg-death/dual/'

    @staticmethod
    def getalot():
        """Return a fresh list of the built-in article URLs, in fixed order.

        Note that the last entry has no ``dual/`` suffix; it is kept exactly
        as it was listed.
        """
        return [
            'https://cn.nytimes.com/china/20190118/china-population-crisis/dual/',
            'https://cn.nytimes.com/asia-pacific/20190118/north-korea-missile-kim-jong-un/dual/',
            'https://cn.nytimes.com/technology/20190118/huawei-investigation-trade-secrets/dual/',
            'https://cn.nytimes.com/asia-pacific/20190117/china-canada-schellenberg-death/dual/',
            'https://cn.nytimes.com/china/20190117/asian-cup-china-tattoos/dual/',
            'https://cn.nytimes.com/usa/20190116/trump-inauguration-spending/dual/',
            'https://cn.nytimes.com/opinion/20190118/germanys-china-problem/dual/',
            'https://cn.nytimes.com/travel/20190118/what-to-do-in-rome-36-hours/dual/',
            'https://cn.nytimes.com/opinion/20190116/will-chinas-economy-hit-a-great-wall/dual/',
            'https://cn.nytimes.com/style/20190115/modern-love-end-of-marriage-google-maps/dual/',
            'https://cn.nytimes.com/opinion/20190115/us-china-trade/dual/',
            'https://cn.nytimes.com/asia-pacific/20190118/philippines-subic-bay-shipyard/dual/',
            'https://cn.nytimes.com/education/20190117/the-gender-achievement-gap-starts-later-for-asian-american-students/dual/',
            'https://cn.nytimes.com/style/20190117/slice-joint-pizza-new-york-city/dual/',
            'https://cn.nytimes.com/culture/20190115/wod-furlough/',
        ]
class Spaker:
    """Wraps a formatted article in the target output format."""

    @staticmethod
    def saveAsHTML(post, filename):
        """Write ``post`` to the file ``<filename>.html``.

        Args:
            post: The complete HTML document as a string.
            filename: Output name without the ``.html`` extension. When it
                is ``None`` the fallback name ``temByNY2PDF.py`` is used.

        Raises:
            OSError: If the file cannot be opened or written.
        """
        if filename is None:
            filename = "temByNY2PDF.py"
        # Write UTF-8 explicitly: the platform default encoding may be unable
        # to represent Chinese text, and the HTML document built in this file
        # declares charset UTF-8.
        with open('{}.html'.format(filename), 'w', encoding='utf-8') as f:
            f.write(post)
class newsProcessor:
    """Downloads one news page and extracts its fields.

    The extracted fields are meant to be handed to Spaker, which saves them
    as a file for reading on a Kindle.
    """

    def __init__(self, url):
        """Store ``url`` and download its HTML via webCommissioner.getHTMLDoc."""
        self._url = url
        self._doc = webCommissioner.getHTMLDoc(self._url)

    def getContentStr(self):
        """Parse the downloaded page and return the article fields.

        Returns:
            A dict with the keys ``title`` (content of the ``og:title`` meta
            tag), ``enTitle`` (text of ``h1.en-title``), ``date`` (content of
            the meta tag with id ``date``), ``contents`` (one stripped text
            per ``div.article-paragraph``) and ``contentStr`` (those texts,
            each followed by a newline). A field whose element is missing
            from the page keeps its default (``None``, ``''``, ``None``,
            ``[]``, ``''``) instead of raising.
        """
        soup = BeautifulSoup(self._doc, "lxml")
        result = {'title': None, 'enTitle': '', 'date': None, 'contents': [], 'contentStr': ''}

        # find() returns None when nothing matches, so each lookup is guarded;
        # a page without one of these elements must not abort the whole run.
        title_tag = soup.find('meta', property='og:title')
        if title_tag is not None:
            result['title'] = title_tag.get('content')

        en_title_tag = soup.find('h1', {'class': 'en-title'})
        if en_title_tag is not None and en_title_tag.string is not None:
            result['enTitle'] = en_title_tag.string

        date_tag = soup.find('meta', id='date')
        if date_tag is not None:
            result['date'] = date_tag.get('content')

        result['contents'] = [
            paragraph.get_text(strip=True)
            for paragraph in soup.find_all('div', {'class': 'article-paragraph'})
        ]
        result['contentStr'] = ''.join("{}\n".format(text) for text in result['contents'])
        return result
class webCommissioner:
    """Everything network-related: downloading HTML, formatting URLs, etc."""

    # Browser-like User-Agent sent with every request.
    _USER_AGENT = ('Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) '
                   'AppleWebKit/537.36 (KHTML, like Gecko) '
                   'Chrome/71.0.3578.98 Safari/537.36')

    @staticmethod
    def getHTMLDoc(url, timeout=2):
        """Download ``url`` with an HTTP GET and return the body as text.

        Args:
            url: Address of the page to download.
            timeout: Value passed to ``requests.get`` as its ``timeout``.
                Defaults to 2, the value that used to be hard-coded.

        Returns:
            The response body decoded by requests (``response.text``). The
            HTTP status code is not checked.
        """
        response = requests.get(
            url,
            headers={'User-Agent': webCommissioner._USER_AGENT},
            timeout=timeout,
        )
        return response.text
class post:
    """One article, held as its fields plus a rendered HTML document."""

    def __init__(self, postDict):
        """Copy the article fields and render them into ``self.htmldoc``.

        Args:
            postDict: Mapping with the keys ``title``, ``enTitle``, ``date``,
                ``contents`` (iterable of paragraph texts) and ``contentStr``.

        Raises:
            KeyError: If one of those keys is missing from ``postDict``.
        """
        # Function-scope import keeps this class self-contained.
        import html

        self.title = postDict['title']
        self.enTitle = postDict['enTitle']
        self.date = postDict['date']
        self.contents = postDict['contents']
        self.contentsStr = postDict['contentStr']

        def escape(value):
            # Values are plain text, so '&', '<' and '>' must be escaped to
            # keep the generated markup well-formed. Formatting first keeps
            # non-string values (e.g. None) rendering as they did before.
            return html.escape('{}'.format(value), quote=False)

        paragraphs = ''.join("<p>{}</p>\n".format(escape(p)) for p in self.contents)
        safe_title = escape(self.title)
        self.htmldoc = """<!DOCTYPE html>
<html lang="zh-cmn-Hans-CN">
<head>
<meta charset="UTF-8">
<meta http-equiv="Content-Language" content="zh-cmn-Hans-CN"/>
<meta name="author" content="Nytimes"/>
<title>{}</title>
</head>
<body>
<h1>{}</h1>
<h1 class="en-title">{}</h1>
{}
</body>
</html>""".format(safe_title, safe_title, escape(self.enTitle), paragraphs)
if __name__ == '__main__':
    # For every built-in URL: download and parse the page, render it as an
    # HTML document, and save that document under the article's title.
    for article_url in URLSet.getalot():
        processor = newsProcessor(article_url)
        fields = processor.getContentStr()
        article = post(fields)
        Spaker.saveAsHTML(article.htmldoc, article.title)