Web scraping is a powerful way to obtain data, and the tricks take time to accumulate.
A few days ago an experiment required me to write a scraper, so I'm recording things here that I may need again.
Most of the code below can be reused directly with minor tweaks.
Packages used
import requests  # fetch web pages
from bs4 import BeautifulSoup  # parse HTML
import re  # regular expressions, also for parsing
import random  # generate randomized headers
import time  # sleep between requests and scrolls
Fetching pages from simple sites
Python code to fetch a page:
def getHTMLText(url):
    def GenRand():
        # random octet in 1..255, used to fabricate an X-Forwarded-For IP
        return int(random.random() * 255) + 1

    if url == '':
        return ''
    if 'http' not in url:
        url = 'https:' + url
    headers = {
        'User-Agent':
        'Mozilla/5.0 (X11; Fedora; Linux x86_64; rv:55.0) Gecko/20100101 Firefox/55.0',
        'Content-Type':
        'application/x-www-form-urlencoded; charset=UTF-8',
        'Connection':
        'close'
    }
    k = 0
    result = ''
    while k < 32:
        try:
            # spoof a different client IP on every attempt
            headers.update({
                "X-FORWARDED-FOR":
                "{}.{}.{}.{}".format(GenRand(), GenRand(), GenRand(),
                                     GenRand())
            })
            k += 1
            if k != 1:
                print('retry')
            with requests.get(url, headers=headers, timeout=5) as html:
                result = html.text
                if html.status_code == 200:
                    break
        except Exception:
            pass
    return result
For small-scale scraping, adding a random sleep between requests is enough; choose the duration case by case.
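Tying the pieces together, here is a minimal sketch of such a loop, using getHTMLText plus the BeautifulSoup import from the top; the URL list, the title lookup, and the 1-4 second sleep range are placeholders to adapt to your own target:

def scrape_pages(urls):
    results = []
    for url in urls:
        html = getHTMLText(url)
        soup = BeautifulSoup(html, 'html.parser')
        # placeholder extraction: grab the page title
        tag = soup.find('title')
        if tag is not None:
            results.append(tag.get_text())
        # random sleep between requests; tune the range per site
        time.sleep(1 + 3 * random.random())
    return results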
Pages with JS: simple human-like fetching with Chrome
Some sites' raw HTML doesn't contain everything you want; sometimes JavaScript has to run first. In that case you can get at the data indirectly by driving a real browser.
from selenium import webdriver

def gethtml(url, thnum):
    # thnum is unused here; the signature matches the Firefox version
    # below, where it selects a window position
    browser = webdriver.Chrome()
    try:
        browser.get(url)
        # scroll to the bottom until the page height stops growing,
        # so lazily loaded content gets rendered
        lastHeight = browser.execute_script(
            'return document.body.scrollHeight')
        while True:
            browser.execute_script(
                'window.scrollTo(0, document.body.scrollHeight);')
            time.sleep(4)
            newHeight = browser.execute_script(
                'return document.body.scrollHeight')
            if newHeight == lastHeight:
                break
            lastHeight = newHeight
        curhtml = browser.page_source
        browser.quit()
        return curhtml
    except Exception as e:
        browser.quit()
        print(e)
        return ''
For small-scale scraping, this human-imitating approach is tolerably fast.
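A usage sketch for an infinite-scroll page, again parsed with BeautifulSoup; the URL and the 'item' class are hypothetical stand-ins for your own target:

html = gethtml('https://example.com/feed', 0)
soup = BeautifulSoup(html, 'html.parser')
# hypothetical extraction: print the text of every 'item' block
for div in soup.find_all('div', class_='item'):
    print(div.get_text(strip=True))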
JS content, no JS execution, with a rotating header: Firefox version
Here is the simplest way to fetch a page through Firefox. It is fairly slow and needs the Firefox extension modify_header.xpi.
from selenium import webdriver

def gethtml(url, thnum):
    # six window rectangles (x, y, width, height); thnum picks one so
    # several browser instances can tile the screen
    windowset = [[0, 0, 600, 400],
                 [600, 0, 600, 400],
                 [1200, 0, 600, 400],
                 [0, 400, 600, 400],
                 [600, 400, 600, 400],
                 [1200, 400, 600, 400]]

    def GenRand():
        # random octet in 1..255, used to fabricate an X-Forwarded-For IP
        return int(random.random() * 255) + 1

    # configure the Modify Headers extension to inject a random
    # X-Forwarded-For header into every request
    profile = webdriver.FirefoxProfile()
    profile.add_extension("modify_header.xpi")
    # profile.set_preference("modifyheaders.config.active", True)
    profile.set_preference("modifyheaders.config.openNewTab", True)
    profile.set_preference("modifyheaders.config.alwaysOn", True)
    profile.set_preference("modifyheaders.headers.count", 1)
    profile.set_preference("modifyheaders.headers.action0", "Add")
    profile.set_preference("modifyheaders.headers.name0", "X-Forwarded-For")
    profile.set_preference("modifyheaders.headers.value0",
                           "{}.{}.{}.{}".format(GenRand(),
                                                GenRand(), GenRand(),
                                                GenRand()))
    profile.set_preference("modifyheaders.headers.enabled0", True)
    profile.update_preferences()
    browser = webdriver.Firefox(profile)
    browser.set_window_rect(windowset[thnum][0], windowset[thnum][1],
                            windowset[thnum][2], windowset[thnum][3])
    browser.set_page_load_timeout(50)
    try:
        browser.get(url)
        # scroll to the bottom until the page height stops growing
        lastHeight = browser.execute_script(
            'return document.body.scrollHeight')
        while True:
            browser.execute_script(
                'window.scrollTo(0, document.body.scrollHeight);')
            time.sleep(4)
            newHeight = browser.execute_script(
                'return document.body.scrollHeight')
            if newHeight == lastHeight:
                break
            lastHeight = newHeight
        curhtml = browser.page_source
        browser.quit()
        return curhtml
    except Exception as e:
        print('error')
        browser.quit()
        print(e)
        return ''
This method is slow and the results are not great.
The ultimate no-JS-execution option
This code not only drives PhantomJS but also uses a proxy pool, which you have to build yourself. If the proxies are poor quality (free ones usually are), refresh the pool often; use your judgement, but every few seconds is ideal. The pool doesn't need many proxies, a dozen or so is enough, since they die quickly anyway.
from selenium import webdriver
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
import random

cnt = 0        # index of the proxy currently in use
pool = dict()  # proxy pool: index -> "host:port"
l = 0          # size of the pool

def getHTMLText(url, flag=1):
    global cnt, l, pool
    if url == '':
        return ''
    if 'http' not in url:
        url = 'http:' + url
    dcap = dict(DesiredCapabilities.PHANTOMJS)
    dcap["phantomjs.page.settings.userAgent"] = (
        "Mozilla/5.0 (Linux; Android 5.1.1; Nexus 6 Build/LYZ28E) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/48.0.2564.23 Mobile Safari/537.36"
    )
    dcap["phantomjs.page.settings.loadImages"] = False
    k = 0
    result = ''
    while k < 50:
        driver = None
        try:
            k += 1
            if k != 1:
                print('retry')
            service_args = []
            if flag == 1:
                # PhantomJS takes its proxy on the command line, so the
                # driver is recreated whenever the proxy changes
                service_args = ['--proxy={}'.format(pool[cnt]),
                                '--proxy-type=http']
            driver = webdriver.PhantomJS(desired_capabilities=dcap,
                                         service_args=service_args)
            driver.get(url)
            result = driver.page_source
            # Selenium exposes no HTTP status code, so treat an empty
            # page as a failed fetch
            if not result:
                raise Exception('empty page')
            driver.quit()
            break
        except Exception:
            if driver is not None:
                driver.quit()
            # the current proxy probably died: rotate to the next one,
            # or refresh the whole pool when it is nearly exhausted
            if cnt >= l - 2:
                cnt = 0
                getpool()
            else:
                cnt += 1
    return result

def getpool():
    """Refresh the proxy pool from a self-hosted pool service."""
    global l
    pool.clear()
    # flag=0: fetch the pool itself without going through a proxy
    html = getHTMLText('Your proxy pools', 0)
    ips = html.split()
    for i, ip in enumerate(ips):
        pool[i] = ip
    l = len(pool)

getpool()
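Since getpool just calls html.split(), the pool service only has to return whitespace-separated host:port entries. As a stand-in for a real pool, here is a minimal sketch of such a service using Flask (an assumption on my part; anything that serves plain text works, and the addresses below are made up):

from flask import Flask

app = Flask(__name__)

# hypothetical proxy addresses, for illustration only
PROXIES = ['10.0.0.1:8080', '10.0.0.2:3128']

@app.route('/')
def pool():
    # one "host:port" per line, matching html.split() in getpool()
    return '\n'.join(PROXIES)

if __name__ == '__main__':
    app.run(port=5000)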