1
watsy0007 2018-10-09 16:22:08 +08:00
```python
class MongoCache:
    """Change-detection cache stored in the ``spider`` MongoDB database.

    Each cached record is a document of the form
    ``{'unique_key': ..., 'data': {...}, 'summaries': {...}}`` where
    ``summaries`` maps every field name of ``data`` to
    ``generator_summary(value)``.  Comparing summaries instead of raw values
    is how the class decides whether a field has changed.

    NOTE(review): ``generator_summary`` is defined outside this class;
    presumably it returns a stable digest of the value -- confirm.
    """

    # Database handle shared by all instances; set by create_instance().
    db = None

    def __init__(self):
        # Connect only once per process.  The previous guard tested for a
        # ``pool`` attribute that was never assigned, so every instantiation
        # built a brand-new MongoClient.
        if MongoCache.db is None:
            MongoCache.create_instance()

    @staticmethod
    def create_instance():
        """Open a client on ``config.MONGO_URL`` and bind the ``spider`` db."""
        client = MongoClient(config.MONGO_URL)
        MongoCache.db = client['spider']

    def _find(self, table, unique_key):
        """Return the stored document for ``unique_key`` or ``None``."""
        return self.db[table].find_one({'unique_key': unique_key})

    @staticmethod
    def _summarize(origin_data):
        """Map every field of ``origin_data`` to its summary."""
        return {k: generator_summary(v) for k, v in origin_data.items()}

    def create(self, table, unique_key, origin_data):
        """Insert a new record unless ``unique_key`` is already cached.

        Returns the ``_id`` of the inserted document, or ``None`` when a
        record with this key already exists.
        """
        if self.exists(table, unique_key):
            return None
        # insert_one replaces the deprecated Collection.insert; returning
        # inserted_id keeps the return value callers received before.
        result = self.db[table].insert_one({
            'unique_key': unique_key,
            'data': origin_data,
            'summaries': self._summarize(origin_data),
        })
        return result.inserted_id

    def get(self, table, unique_key):
        """Return the cached ``data`` dict, or ``None`` if nothing is cached."""
        doc = self._find(table, unique_key)
        if doc is None:
            return None
        return doc['data']

    def exists(self, table, unique_key):
        """Return ``True`` when a record with ``unique_key`` is cached."""
        # Only existence matters, so project down to ``_id``.
        doc = self.db[table].find_one({'unique_key': unique_key}, {'_id': 1})
        return doc is not None

    def is_changed(self, table, unique_key, origin_data):
        """Return ``True`` if any field of ``origin_data`` differs from cache.

        A record that is not cached at all counts as changed, as does a field
        that has no stored summary.
        """
        # One lookup instead of exists() + find_one(): fewer round-trips and
        # no failure if the document disappears between the two queries.
        doc = self._find(table, unique_key)
        if doc is None:
            return True
        last_summaries = doc['summaries']
        for key, value in origin_data.items():
            last_summary = last_summaries.get(key)
            if last_summary is None or last_summary != generator_summary(value):
                return True
        return False

    def change_fields(self, table, unique_key, origin_data):
        """Return the subset of ``origin_data`` whose summaries changed.

        When nothing is cached for ``unique_key`` the whole ``origin_data``
        is returned (the same object, not a copy).
        """
        doc = self._find(table, unique_key)
        if doc is None:
            return origin_data
        last_summaries = doc['summaries']
        changes = {}
        for key, value in origin_data.items():
            last_summary = last_summaries.get(key)
            if last_summary is None or last_summary != generator_summary(value):
                changes[key] = value
        return changes

    def update(self, table, unique_key, origin_data):
        """Overwrite the cached data and summaries; return ``origin_data``.

        If no record matches ``unique_key`` nothing is written: update_one
        without ``upsert`` is a no-op on a non-matching filter, so a separate
        existence check is unnecessary.
        """
        self.db[table].update_one(
            {'unique_key': unique_key},
            {'$set': {
                'data': origin_data,
                'summaries': self._summarize(origin_data),
            }},
        )
        return origin_data
2
watsy0007 2018-10-09 16:24:07 +08:00
v2ex 不支持 markdown...
https://gist.github.com/watsy0007/779c27fb0ceab283cc434b5eec10b7c4 封装了针对数据处理的公共方法. |
3
picone 2018-10-09 20:47:42 +08:00
我是直接 mongo 加 unique 索引,并捕捉索引冲突异常。。
|
5
picone 2018-10-12 13:33:01 +08:00
@picone #3 可以多个 key 做索引 https://docs.mongodb.com/manual/core/index-multikey/
|
6
Ewig OP @picone db.XiaoMiQuan.find()
{ "_id" : ObjectId("5bbf14dbc96b5b3f5627d11d"), "file_url" : "https://baogaocos.seedsufe.com/2018/07/19/doc_1532004923556.pdf", "name" : "AMCHAM-中国的“一带一路”:对美国企业的影响(英文)-2018.6-8 页.pdf" }我现在是这样写的 这是对的? |