V2EX = way to explore
V2EX 是一个关于分享和探索的地方
现在注册
已注册用户请  登录
推荐学习书目
Learn Python the Hard Way
Python Sites
PyPI - Python Package Index
http://diveintopython.org/toc/index.html
Pocoo
值得关注的项目
PyPy
Celery
Jinja2
Read the Docs
gevent
pyenv
virtualenv
Stackless Python
Beautiful Soup
结巴中文分词
Green Unicorn
Sentry
Shovel
Pyflakes
pytest
Python 编程
pep8 Checker
Styles
PEP 8
Google Python Style Guide
Code Style from The Hitchhiker's Guide
Gary_Cheung
V2EX  ›  Python

爱屋吉屋页面爬下来是空的,该肿么解决?

  •  
  •   Gary_Cheung · 2016-04-29 14:33:27 +08:00 · 4513 次点击
    这是一个创建于 3131 天前的主题,其中的信息可能已经有所发展或是发生改变。
    iwjw.com 的房源,全部是空页面,肿么办?

    headers = {
    'Cookie':'IW_UUID2_COOKIES=6a2030687b904bb8bce1c6846c99b1a6; IW_UUID_COOKIES=e3e01670134042d494cd70c725c4698f; iw_user_last_housetype_h5=2; Hm_lvt_3a10bbf8b4afa9e5ab91d66b6944e813=1461846272,1461846860; Hm_lpvt_3a10bbf8b4afa9e5ab91d66b6944e813=1461909938; iw_user_last_housetype=2'
    }
    url = 'http://m.iwjw.com/sale/shanghai/?kw=%E5%8F%A4%E6%A1%90%E5%85%AC%E5%AF%93'
    web_data = requests.get(url,headers=headers)
    soup = BeautifulSoup(web_data.text,'lxml')

    得到的结果:

    /Library/Frameworks/Python.framework/Versions/3.5/bin/python3.5 /Users/Emma_Tang/PycharmProjects/house_to_buy/get_house_iwjw.py
    <!DOCTYPE html>
    <html>
    <head>
    <meta charset="utf-8"/>
    <meta content="IE=Edge" http-equiv="X-UA-Compatible"/>
    <title>二手房列表-上海--爱屋吉屋</title>
    <meta content="" name="keywords"/>
    <meta content="" name="description"/>
    <meta content="no-cache, no-store, must-revalidate" http-equiv="Cache-Control"/>
    <meta content="no-cache" http-equiv="Pragma"/>
    <meta content="0" http-equiv="Expires"/>
    <meta content="爱屋吉屋" name="apple-mobile-web-app-title"/>
    <meta content="width=device-width,initial-scale=1.0, minimum-scale=1.0, maximum-scale=1.0, user-scalable=no" name="viewport"/>
    <meta content="portrait" name="screen-orientation"/>
    <meta content="portrait" name="x5-orientation"/>
    <meta content="yes" name="apple-mobile-web-app-capable"/>
    <link href="http://resource.iwjw.com/iwjw-h5/img/common/screen_logo.jpg" rel="apple-touch-icon-precomposed"/>
    <link href="http://resource.iwjw.com/iwjw-h5/common_4.3.2.1.css" rel="stylesheet"/>
    <link href="http://resource.iwjw.com/iwjw-h5/list_4.2.css" rel="stylesheet"/>
    <script>
    window.pageConfig = {
    siteUrl: "http://www.iwjw.com",
    mobileSiteUrl: 'http://m.iwjw.com',
    datacollectUrl:'http://collect.iwjwagent.com/dataCollect/',
    recommendUrl:'http://recommend.iwjw.com',
    staticUrl: 'http://resource.iwjw.com/iwjw-h5/',
    sell: 'true' == 'true' ? true : false,
    rent: 'true' == 'true' ? true : false,
    provincepy: "shanghai",
    provinceid: "2",
    locationsJson: [{"flag":"01","id":2,"lat":31.238092,"lineCode":0,"lon":121.479659,"name":"上海","shortSpell":"sh","spell":"shanghai"},{"flag":"02","id":12438,"lat":39.920176,"lineCode":0,"lon":116.407808,"name":"北京","shortSpell":"bj","spell":"beijing"},{"flag":"04","id":40000,"lat":23.135736,"lineCode":0,"lon":113.271143,"name":"广州","shortSpell":"gz","spell":"guangzhou"},{"flag":"03","id":56000,"lat":22.547923,"lineCode":0,"lon":114.063918,"name":"深圳","shortSpell":"sz","spell":"shenzhen"},{"flag":"","id":71049,"lat":30.281202,"lineCode":0,"lon":120.161596,"name":"杭州","shortSpell":"hz","spell":"hangzhou"},{"flag":"","id":71099,"lat":39.137214,"lineCode":0,"lon":117.186954,"name":"天津","shortSpell":"tj","spell":"tianjin"},{"flag":"","id":86724,"lat":32.058854,"lineCode":0,"lon":118.784457,"name":"南京","shortSpell":"nj","spell":"nanjing"},{"flag":"","id":86725,"lat":30.595037,"lineCode":0,"lon":114.291675,"name":"武汉","shortSpell":"wh","spell":"wuhan"},{"flag":"","id":98289,"lat":30.661125,"lineCode":0,"lon":104.071599,"name":"成都","shortSpell":"cd","spell":"chengdu"},{"flag":"","id":98290,"lat":29.561329,"lineCode":0,"lon":106.548784,"name":"重庆","shortSpell":"cq","spell":"chongqing"}],
    ht: '2',
    platform: ''
    };
    var _hmt = _hmt || [];
    </script>
    </head>
    <body>
    <script>
    window.pageConfig = window.pageConfig || {};

    window.pageConfig.list = {
    provinceid: "2",
    provincepy: "shanghai",
    provincename: "上海",
    housetype: "2",
    siteUrl: "http://www.iwjw.com",
    prices: '[{"key":-1,"txt":"全部"},{"key":1,"txt":"100 万以下"},{"key":2,"txt":"100-150 万"},{"key":3,"txt":"150-200 万"},{"key":4,"txt":"200-300 万"},{"key":5,"txt":"300-500 万"},{"key":6,"txt":"500-700 万"},{"key":7,"txt":"700-1000 万"},{"key":8,"txt":"1000 万以上"}]',
    options: {
    kw: "古桐公寓",
    g: "0",
    areaId: "0",
    stationId: "0",
    ip: "-1",
    ia: "-1",
    sp: "-1",
    ep: "-1",
    sa: "-1",
    ea: "-1",
    rn: "-1",
    fe: "-1",
    dt: "-1",
    o: "0"
    }
    };
    </script><div id="wx_pic" style="position: absolute;top: -100000px;"><img src="http://resource.iwjw.com/iwjw-h5/img/common/wx_share_logo.jpg"/></div>
    <div id="iwjw">
    <div class="mod-h5-container">
    <div class="mod-list">
    </div>
    </div>
    </div>
    <script src="http://resource.iwjw.com/iwjw-h5/common_4.3.4.js"></script>
    <script src="http://resource.iwjw.com/iwjw-h5/list_4.3.4.js"></script>
    </body>
    <script>
    var _hmt = _hmt || [];
    (function() {
    var hm = document.createElement("script");
    hm.src = "//hm.baidu.com/hm.js?3a10bbf8b4afa9e5ab91d66b6944e813";
    hm.async = 1;
    var s = document.getElementsByTagName("script")[0];
    s.parentNode.insertBefore(hm, s);
    })();

    var _bdhmProtocol = (("https:" == document.location.protocol) ? " https://" : " http://");
    document.write(unescape("%3Cscript async src='" + _bdhmProtocol + "hm.baidu.com/h.js%3Fd3a10bbf8b4afa9e5ab91d66b6944e813' type='text/javascript'%3E%3C/script%3E"));
    </script></html>

    Process finished with exit code 0
    8 条回复    2016-04-30 17:03:31 +08:00
    seki
        1
    seki  
       2016-04-29 14:46:29 +08:00
    感觉列表数据是通过 ajax 请求加载的?
    用浏览器 dev tool 看看有没有 ajax 数据包,有的话改为直接抓取那个数据包
    Gary_Cheung
        2
    Gary_Cheung  
    OP
       2016-04-29 14:48:05 +08:00
    sorry,我傻了,已经解决
    eoo
        3
    eoo  
       2016-04-29 15:01:56 +08:00 via Android
    。。。。。
    wisonic
        4
    wisonic  
       2016-04-29 16:01:36 +08:00 via iPhone
    都是假房源,爬来干嘛
    Gary_Cheung
        5
    Gary_Cheung  
    OP
       2016-04-29 16:20:15 +08:00
    @wisonic 纯属做个小练习 😄
    xustrive
        6
    xustrive  
       2016-04-29 18:53:43 +08:00
    @Gary_Cheung 根据一楼解决的?
    ayaseangle
        7
    ayaseangle  
       2016-04-29 22:20:07 +08:00 via Android
    大家爬来爬去。。。
    Gary_Cheung
        8
    Gary_Cheung  
    OP
       2016-04-30 17:03:31 +08:00
    @xustrive 是的
    关于   ·   帮助文档   ·   博客   ·   API   ·   FAQ   ·   实用小工具   ·   2700 人在线   最高记录 6679   ·     Select Language
    创意工作者们的社区
    World is powered by solitude
    VERSION: 3.9.8.5 · 25ms · UTC 06:41 · PVG 14:41 · LAX 22:41 · JFK 01:41
    Developed with CodeLauncher
    ♥ Do have faith in what you're doing.