Web scraping is a powerful way to obtain data, and the tricks take time to accumulate.
A few days ago an experiment required me to write a scraper, so I'm recording things here that I may need again.
Most of the code below can be reused directly with minor tweaks.
Packages used
import requests  # fetch web pages
from bs4 import BeautifulSoup  # parse HTML
import re  # regular expressions, also for parsing
import random  # generate randomized headers
import time  # sleep between requests and scrolls
Fetching pages from simple sites
Python code to fetch a page:
def getHTMLText(url):
    def GenRand():
        # random octet in 1..255, used to fabricate an X-Forwarded-For IP
        return int(random.random() * 255) + 1

    if url == '':
        return ''
    if 'http' not in url:
        url = 'https:' + url
    headers = {
        'User-Agent':
        'Mozilla/5.0 (X11; Fedora; Linux x86_64; rv:55.0) Gecko/20100101 Firefox/55.0',
        'Content-Type':
        'application/x-www-form-urlencoded; charset=UTF-8',
        'Connection':
        'close'
    }
    k = 0
    result = ''
    while k < 32:
        try:
            # spoof a different client IP on every attempt
            headers.update({
                "X-FORWARDED-FOR":
                "{}.{}.{}.{}".format(GenRand(), GenRand(), GenRand(),
                                     GenRand())
            })
            k += 1
            if k != 1:
                print('retry')
            with requests.get(url, headers=headers, timeout=5) as html:
                result = html.text
                if html.status_code == 200:
                    break
        except Exception:
            pass
    return result
For small-scale scraping, adding a random sleep between requests is enough; choose the duration case by case.
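Tying the pieces together, here is a minimal sketch of such a loop, using getHTMLText plus the BeautifulSoup import from the top; the URL list, the title lookup, and the 1-4 second sleep range are placeholders to adapt to your own target:

def scrape_pages(urls):
    results = []
    for url in urls:
        html = getHTMLText(url)
        soup = BeautifulSoup(html, 'html.parser')
        # placeholder extraction: grab the page title
        tag = soup.find('title')
        if tag is not None:
            results.append(tag.get_text())
        # random sleep between requests; tune the range per site
        time.sleep(1 + 3 * random.random())
    return results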
Pages with JS: simple human-like fetching with Chrome
Some sites' raw HTML doesn't contain everything you want; sometimes JavaScript has to run first. In that case you can get at the data indirectly by driving a real browser.
from selenium import webdriver

def gethtml(url, thnum):
    # thnum is unused here; the signature matches the Firefox version
    # below, where it selects a window position
    browser = webdriver.Chrome()
    try:
        browser.get(url)
        # scroll to the bottom until the page height stops growing,
        # so lazily loaded content gets rendered
        lastHeight = browser.execute_script(
            'return document.body.scrollHeight')
        while True:
            browser.execute_script(
                'window.scrollTo(0, document.body.scrollHeight);')
            time.sleep(4)
            newHeight = browser.execute_script(
                'return document.body.scrollHeight')
            if newHeight == lastHeight:
                break
            lastHeight = newHeight
        curhtml = browser.page_source
        browser.quit()
        return curhtml
    except Exception as e:
        browser.quit()
        print(e)
        return ''
For small-scale scraping, this human-imitating approach is tolerably fast.
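A usage sketch for an infinite-scroll page, again parsed with BeautifulSoup; the URL and the 'item' class are hypothetical stand-ins for your own target:

html = gethtml('https://example.com/feed', 0)
soup = BeautifulSoup(html, 'html.parser')
# hypothetical extraction: print the text of every 'item' block
for div in soup.find_all('div', class_='item'):
    print(div.get_text(strip=True))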
JS content, no JS execution, with a rotating header: Firefox version
Here is the simplest way to fetch a page through Firefox. It is fairly slow and needs the Firefox extension modify_header.xpi.
from selenium import webdriver

def gethtml(url, thnum):
    # six window rectangles (x, y, width, height); thnum picks one so
    # several browser instances can tile the screen
    windowset = [[0, 0, 600, 400],
                 [600, 0, 600, 400],
                 [1200, 0, 600, 400],
                 [0, 400, 600, 400],
                 [600, 400, 600, 400],
                 [1200, 400, 600, 400]]

    def GenRand():
        # random octet in 1..255, used to fabricate an X-Forwarded-For IP
        return int(random.random() * 255) + 1

    # configure the Modify Headers extension to inject a random
    # X-Forwarded-For header into every request
    profile = webdriver.FirefoxProfile()
    profile.add_extension("modify_header.xpi")
    # profile.set_preference("modifyheaders.config.active", True)
    profile.set_preference("modifyheaders.config.openNewTab", True)
    profile.set_preference("modifyheaders.config.alwaysOn", True)
    profile.set_preference("modifyheaders.headers.count", 1)
    profile.set_preference("modifyheaders.headers.action0", "Add")
    profile.set_preference("modifyheaders.headers.name0", "X-Forwarded-For")
    profile.set_preference("modifyheaders.headers.value0",
                           "{}.{}.{}.{}".format(GenRand(),
                                                GenRand(), GenRand(),
                                                GenRand()))
    profile.set_preference("modifyheaders.headers.enabled0", True)
    profile.update_preferences()
    browser = webdriver.Firefox(profile)
    browser.set_window_rect(windowset[thnum][0], windowset[thnum][1],
                            windowset[thnum][2], windowset[thnum][3])
    browser.set_page_load_timeout(50)
    try:
        browser.get(url)
        # scroll to the bottom until the page height stops growing
        lastHeight = browser.execute_script(
            'return document.body.scrollHeight')
        while True:
            browser.execute_script(
                'window.scrollTo(0, document.body.scrollHeight);')
            time.sleep(4)
            newHeight = browser.execute_script(
                'return document.body.scrollHeight')
            if newHeight == lastHeight:
                break
            lastHeight = newHeight
        curhtml = browser.page_source
        browser.quit()
        return curhtml
    except Exception as e:
        print('error')
        browser.quit()
        print(e)
        return ''
This method is slow and the results are not great.
The ultimate no-JS-execution option
This code not only drives PhantomJS but also uses a proxy pool, which you have to build yourself. If the proxies are poor quality (free ones usually are), refresh the pool often; use your judgement, but every few seconds is ideal. The pool doesn't need many proxies, a dozen or so is enough, since they die quickly anyway.
from selenium import webdriver
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
import random

cnt = 0        # index of the proxy currently in use
pool = dict()  # proxy pool: index -> "host:port"
l = 0          # size of the pool

def getHTMLText(url, flag=1):
    global cnt, l, pool
    if url == '':
        return ''
    if 'http' not in url:
        url = 'http:' + url
    dcap = dict(DesiredCapabilities.PHANTOMJS)
    dcap["phantomjs.page.settings.userAgent"] = (
        "Mozilla/5.0 (Linux; Android 5.1.1; Nexus 6 Build/LYZ28E) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/48.0.2564.23 Mobile Safari/537.36"
    )
    dcap["phantomjs.page.settings.loadImages"] = False
    k = 0
    result = ''
    while k < 50:
        driver = None
        try:
            k += 1
            if k != 1:
                print('retry')
            service_args = []
            if flag == 1:
                # PhantomJS takes its proxy on the command line, so the
                # driver is recreated whenever the proxy changes
                service_args = ['--proxy={}'.format(pool[cnt]),
                                '--proxy-type=http']
            driver = webdriver.PhantomJS(desired_capabilities=dcap,
                                         service_args=service_args)
            driver.get(url)
            result = driver.page_source
            # Selenium exposes no HTTP status code, so treat an empty
            # page as a failed fetch
            if not result:
                raise Exception('empty page')
            driver.quit()
            break
        except Exception:
            if driver is not None:
                driver.quit()
            # the current proxy probably died: rotate to the next one,
            # or refresh the whole pool when it is nearly exhausted
            if cnt >= l - 2:
                cnt = 0
                getpool()
            else:
                cnt += 1
    return result

def getpool():
    """Refresh the proxy pool from a self-hosted pool service."""
    global l
    pool.clear()
    # flag=0: fetch the pool itself without going through a proxy
    html = getHTMLText('Your proxy pools', 0)
    ips = html.split()
    for i, ip in enumerate(ips):
        pool[i] = ip
    l = len(pool)

getpool()
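Since getpool just calls html.split(), the pool service only has to return whitespace-separated host:port entries. As a stand-in for a real pool, here is a minimal sketch of such a service using Flask (an assumption on my part; anything that serves plain text works, and the addresses below are made up):

from flask import Flask

app = Flask(__name__)

# hypothetical proxy addresses, for illustration only
PROXIES = ['10.0.0.1:8080', '10.0.0.2:3128']

@app.route('/')
def pool():
    # one "host:port" per line, matching html.split() in getpool()
    return '\n'.join(PROXIES)

if __name__ == '__main__':
    app.run(port=5000)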