response 长这样 {'keyBoardList': [{'orgId': 'gssz0000001', 'category': 'A 股', 'plate': 'szse', 'code': '000001', 'pinyin': 'payh', 'zwjc': '平安银行'}], 'classifiedAnnouncements': [{'id': None, 'secCode': '000001', 'secName': '平安银行', 'orgId': 'gssz0000001', 'announcementId': '1212287014', 'announcementTitle': '平安银行股份有限公司关联交易公告', 'announcementTime': 1643212800000, 'adjunctUrl': 'finalpage/2022-01-27/1212287014.PDF', 'adjunctSize': 108, 'adjunctType': 'PDF', 'storageTime': None, 'columnId': '09020202||250101||251302', 'pageColumn': 'SZZB', 'announcementType': '01010503||010112||011719', 'associateAnnouncement': None, 'important': None, 'batchNum': None, 'announcementContent': '', 'orgName': None, 'announcementTypeName': None}, {'id': None, 'secCode': '000001', 'secName': '平安银行', 'orgId': 'gssz0000001', 'announcementId': '1212287015', 'announcementTitle': '平安银行股份有限公司关联交易公告', 'announcementTime': 1643212800000, 'adjunctUrl': 'finalpage/2022-01-27/1212287015.PDF', 'adjunctSize': 180, 'adjunctType': 'PDF', 'storageTime': None, 'columnId': '09020202||250101||251302', 'pageColumn': 'SZZB', 'announcementType': '01010503||010112||011719', 'associateAnnouncement': None, 'important': None, 'batchNum': None, 'announcementContent': '', 'orgName': None, 'announcementTypeName': None}, {'id': None, 'secCode': '000001', 'secName': '平安银行', 'orgId': 'gssz0000001', 'announcementId': '1212287017', 'announcementTitle': '平安银行股份有限公司关联交易公告', 'announcementTime': 1643212800000, 'adjunctUrl': 'finalpage/2022-01-27/1212287017.PDF', 'adjunctSize': 182, 'adjunctType': 'PDF', 'storageTime': None, 'columnId': '09020202||250101||251302', 'pageColumn': 'SZZB', 'announcementType': '01010503||010112||011719', 'associateAnnouncement': None, 'important': None, 'batchNum': None, 'announcementContent': '', 'orgName': None, 'announcementTypeName': None}, {'id': None, 'secCode': '000001', 'secName': '平安银行', 'orgId': 'gssz0000001', 'announcementId': '1212287016', 'announcementTitle': '平安银行股份有限公司关联交易公告', 'announcementTime': 1643212800000, 'adjunctUrl': 'finalpage/2022-01-27/1212287016.PDF', 'adjunctSize': 145, 'adjunctType': 'PDF', 'storageTime': None, 'columnId': '09020202||250101||251302', 'pageColumn': 'SZZB', 'announcementType': '01010503||010112||011719', 'associateAnnouncement': None, 'important': None, 'batchNum': None, 'announcementContent': '', 'orgName': None, 'announcementTypeName': None}, {'id': None, 'secCode': '000001', 'secName': '平安银行', 'orgId': 'gssz0000001', 'announcementId': '1212287018', 'announcementTitle': '平安银行股份有限公司独立董事独立意见', 'announcementTime': 1643212800000, 'adjunctUrl': 'finalpage/2022-01-27/1212287018.PDF', 'adjunctSize': 199, 'adjunctType': 'PDF', 'storageTime': None, 'columnId': '09020202||250101||251302', 'pageColumn': 'SZZB', 'announcementType': '01011107||010112||01310565', 'associateAnnouncement': None, 'important': None, 'batchNum': None, 'announcementContent': '', 'orgName': None, 'announcementTypeName': None}]}
然后我取
r = requests.post(url, headers=hd, data=data)
print(r.json())
#print(r.json()['keyBoardList'])
org_id = r.json()['keyBoardList'][0]['orgId']
总是提示我 out of range
print(r.json())会多出一个[].感觉有点奇怪啊
请教,这个有没有好的办法呢
1
binux 2022-02-05 01:03:21 +08:00 via Android
JSON 是双引号
|
2
sjmcefc2 OP 哎呀,还真是,我这个是 request 回来的 response ,应该是 json 啊
|
3
ooops 2022-02-05 02:28:09 +08:00 via iPhone
keyboard b 小写
|
4
coldear 2022-02-05 02:47:16 +08:00
不带引号的`None` 是啥
|
5
ysc3839 2022-02-05 03:14:40 +08:00
发原始数据看看吧,根据你提供的 response 是没有问题的
|
6
mikewang 2022-02-05 03:14:49 +08:00 via iPhone
笑死,单引号和 None ,这不就是 Python 的 repr
这玩意不是 JSON , 用 from ast import literal_eval 吧 |
7
guoqiao 2022-02-05 07:22:02 +08:00 1
所谓 json, 它跟 html/markdown/xml/yaml 一样,是一种有特定格式的**文本**,本质上它是字符串。
request.json() 方法,就是把 json 字符串转换成 python object, 通常是 dict, list, 相当于是 json.loads(response.text). json 格式和 python / js 等语言里的 object 看起来几乎一样,所以很多人会混为一谈. 不过即使如此,沟通的时候也无伤大雅。 楼上一堆人抠字眼嘲笑别人说这不是 json(是 python dict), 虽然说得对,但是没有帮助. 从楼主贴的数据来看,你的代码应该没有问题。 我猜测你贴出的数据可能已经不准确了,或者不完整。 尤其"print(r.json())会多出一个[].感觉有点奇怪啊"这句, 你贴的数据里并没有你说的多出的 []。 |
8
xuanbg 2022-02-05 08:42:35 +08:00
因为后端返回的数据不是 json 呀。。。当然,这个问题前端自己也能处理,单引号替换为双引号,None 替换成 null 就好了。但个人不建议这么做,因为问题的根源在后端,正确的办法是找后端,让他返回正确的数据。
|
9
sjmcefc2 OP # 这是一个示例 Python 脚本。
# 按 Shift+F10 执行或将其替换为您的代码。 # 按 双击 Shift 在所有地方搜索类、文件、工具窗口、操作和设置。 import requests import random import time import pandas as pd import json download_path = 'http://static.cninfo.com.cn/' saving_path = 'C:/Users/ja/PycharmProjects/report2020' User_Agent = [ ###这里自建一个 User_Agent 列表 ] # User_Agent 的集合 headers = {'Accept': 'application/json, text/javascript, */*; q=0.01', "Content-Type": "application/x-www-form-urlencoded; charset=UTF-8", "Accept-Encoding": "gzip, deflate", "Accept-Language": "zh-CN,zh;q=0.9,en-US;q=0.8,en;q=0.7,zh-HK;q=0.6,zh-TW;q=0.5", 'Host': 'www.cninfo.com.cn', 'Origin': 'http://www.cninfo.com.cn', 'Referer': 'http://www.cninfo.com.cn/new/commonUrl?url=disclosure/list/notice', 'X-Requested-With': 'XMLHttpRequest' } ###巨潮要获取数据,需要 ordid 字段,具体 post 的形式是'stock':'证券代码,ordid;' def get_orgid(Namelist): orglist = [] url = 'http://www.cninfo.com.cn/new/information/topSearch/detailOfQuery' hd = { 'Host': 'www.cninfo.com.cn', 'Origin': 'http://www.cninfo.com.cn', 'Pragma': 'no-cache', 'Accept-Encoding': 'gzip,deflate', 'Connection': 'keep-alive', 'Content-Length': '70', 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.212 Safari/537.36', 'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8', 'Accept': 'application/json,text/plain,*/*', 'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8'} for name in Namelist: data = {'keyWord': name, 'maxSecNum': 10, 'maxListNum': 5, } #print(data) r = requests.post(url, headers=hd, data=data) #print(r.json()['keyBoardList']) org_id = r.json()['keyBoardList'][0]['orgId'] #print(org_id+'****'+name) orglist.append(org_id) ##对列表去重 formatlist = list(set(orglist)) formatlist.sort(key=orglist.index) return formatlist def single_page(stock): query_path = 'http://www.cninfo.com.cn/new/hisAnnouncement/query' headers['User-Agent'] = random.choice(User_Agent) # 定义 User_Agent print(stock) query = {'pageNum': 1, # 页码 'pageSize': 30, 'tabName': 'fulltext', 'column': 'szse', 'stock': stock, 'searchkey': '', 'secid': '', 'plate': '', 'category': 'category_ndbg_szsh;', # 年度报告 'trade': '', # 行业 'seDate': '2020-11-27~2021-05-28' # 时间区间 } namelist = requests.post(query_path, headers=headers, data=query) single_page = namelist.json()['announcements'] print(len(single_page)) return single_page # json 中的年度报告信息 def saving(single_page): # 下载年报 headers = {'Host': 'static.cninfo.com.cn', 'Connection': 'keep-alive', 'Upgrade-Insecure-Requests': '1', 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.212 Safari/537.36 Edg/90.0.818.66', 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9', 'Accept-Encoding': 'gzip, deflate', 'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6', 'Cookie': 'routeId=.uc1' } for i in single_page: if ('2020 年年度报告(更新后)' in i['announcementTitle']) or ('2020 年年度报告' in i['announcementTitle']) or ( '2020 年年度报告(修订版)' in i['announcementTitle']): download = download_path + i["adjunctUrl"] name = i["secCode"] + '_' + i['secName'] + '_' + i['announcementTitle'] + '.pdf' file_path = saving_path + '/' + name print(file_path) time.sleep(random.random() * 2) headers['User-Agent'] = random.choice(User_Agent) r = requests.get(download, headers=headers) time.sleep(10) print(r.status_code) f = open(file_path, "wb") f.write(r.content) f.close() else: continue if __name__ == '__main__': Sec = pd.read_excel('listed.xlsx', dtype={'code': 'object'}) # 读取 excel,证券代码+证券简称 #print(Sec) Seclist = list(Sec['code']) # 证券代码转换成 list #print(Seclist) Namelist = list(Sec['name']) #print(Namelist) org_list = get_orgid(Namelist) Sec['orgid'] = org_list Sec.to_excel('listed.xlsx', sheet_name='Sheet2', index=False) stock = '' count = 0 ##按行遍历 for rows in Sec.iterrows(): stock = str(rows[1]['code']) + ',' + str(rows[1]['orgid']) + ';' try: page_data = single_page(stock) except: print('page error, retrying') try: page_data = single_page(stock) except: print('page error!') saving(page_data) count = count + 1 print('共有', count, '家券商') |
10
sjmcefc2 OP 大概思路就是想着从巨潮网络下载 pdf 年报,取一个 org_id ,拼接出下载链接
|
13
guoqiao 2022-02-05 15:25:41 +08:00
*哪个
|
14
sjmcefc2 OP 总是提醒我有非法 url
{'keyBoardList': [], 'classifiedAnnouncements': []} [] |
15
krixaar 2022-02-07 13:51:18 +08:00
拿#9 代码去掉两行注释直接跑的,name='平安银行',结果正常,无法复现。
![test9.jpg]( https://vip1.loli.io/2022/02/07/mbHetTLlVG4vsdg.jpg) |