Web Scraping
Scraping Youdao Translate (POST the text, then download the result)
import ssl
ssl._create_default_https_context = ssl._create_unverified_context
# disable SSL certificate verification
import urllib.request
import urllib.parse
import json
import time
while True:
    url = 'http://fanyi.youdao.com/translate?smartresult=dict&smartresult=rule'
    data = {}
    data['i'] = input("Enter the text to translate:\n")
    data['doctype'] = 'json'
    data['keyfrom'] = 'fanyi.web'
    data = urllib.parse.urlencode(data).encode('utf-8')
    request = urllib.request.Request(url, data)  # POST the sentence to be translated
    request.add_header('User-Agent', 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.138 Safari/537.36')  # disguise the request as a browser visit
    response = urllib.request.urlopen(request)
    html = response.read().decode('utf-8')
    target = json.loads(html)
    print(target['translateResult'][0][0]['tgt'])
    time.sleep(2)
Parsing HTML with BeautifulSoup
Searching
find_all(self, name=None, attrs={}, recursive=True, text=None, limit=None, **kwargs)
Finds all matches
self: the element to search within
name: the tag name of the target element
attrs: the element's attributes, as a dict of attribute:value
recursive: whether the search descends into the node's subtree
A user-defined filter function can also be passed in place of name (see the sketch and example below)
find(self, name=None, attrs={}, recursive=True, text=None, limit=None, **kwargs)
Finds the first match
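A minimal sketch of find versus find_all on an inline fragment; the HTML, class names, and the lambda filter here are made up for illustration:
from bs4 import BeautifulSoup

html = "<div><p class='intro'>Hello</p><p class='body'>World</p></div>"  # hypothetical fragment
soup = BeautifulSoup(html, 'lxml')
print(soup.find('p'))                                # first <p> only
print(soup.find_all('p', attrs={'class': 'body'}))   # every <p class='body'>
print(soup.find_all(lambda tag: tag.name == 'p' and tag.text == 'World'))  # custom filter function
A fuller example against a real page follows.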
import ssl
ssl._create_default_https_context = ssl._create_unverified_context
import urllib.request
from bs4 import BeautifulSoup
try:
    url = 'https://liting1024.github.io/2020/02/20/Python/'
    response = urllib.request.urlopen(url)
    html = response.read().decode()
    soup = BeautifulSoup(html, 'lxml')
    def endsWith(s, t):
        if len(s) >= len(t):
            return s[len(s)-len(t):] == t
        return False
    def myFilter(tag):
        # the element is an <a>, has an href equal to '/category', and its text ends with 'ies'
        return (tag.name == 'a' and tag.has_attr('href') and tag['href'] == '/category' and endsWith(tag.text, 'ies'))
    tag1 = soup.find('h1')
    tag2 = soup.find_all('a', attrs={'class': 'menu-item'})
    tag3 = soup.find_all(myFilter)
    print(tag1, '\n', tag2, '\n', tag3)
    for tag in tag2:
        print(tag['href'])
    for tag in tag2:
        print(tag.text)
except Exception as err:
    print(err)
Traversal
tag.parent: the parent node
tag.children: the child nodes
tag.descendants: all descendant nodes
tag.next_sibling: the nearest following sibling node
tag.previous_sibling: the previous sibling node
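A short sketch of these traversal attributes; the HTML fragment is made up for illustration:
from bs4 import BeautifulSoup

html = "<ul><li>a</li><li>b</li><li>c</li></ul>"   # hypothetical fragment
soup = BeautifulSoup(html, 'lxml')
li = soup.find('li')
print(li.parent.name)                           # ul
print([c.name for c in li.parent.children])     # child nodes of <ul>: ['li', 'li', 'li']
print(list(li.parent.descendants))              # every descendant: the three <li> tags plus their text nodes
print(li.next_sibling)                          # the next sibling, <li>b</li>
print(li.next_sibling.previous_sibling)         # back to the first <li>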
Searching with CSS selectors
soup.select("tagName[attName=value]")  # select() takes a CSS selector string
Selector | Description |
---|---|
attName^=value | attribute value starts with value |
attName$=value | attribute value ends with value |
attName*=value | attribute value contains value |
soup.select("p a[rel='noopener']")
# finds <a> elements with rel='noopener' under <p>
soup.select("p > a")
# finds <a> elements that are direct children of <p>, excluding grandchildren
soup.select("p ~ a")
# finds <a> elements that are siblings following <p>
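A runnable sketch of these selectors; the fragment, class names, and href values are made up for illustration:
from bs4 import BeautifulSoup

html = ("<div><p>intro <a href='https://example.com' rel='noopener'>link</a></p>"
        "<a class='menu-item' href='/home'>home</a></div>")   # hypothetical fragment
soup = BeautifulSoup(html, 'lxml')
print(soup.select("p a[rel='noopener']"))   # <a> with rel='noopener' under <p>
print(soup.select("p > a"))                 # direct child <a> of <p>
print(soup.select("a[href^='/']"))          # href starts with '/'
print(soup.select("p ~ a"))                 # <a> siblings that follow <p>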
Scraping the weather forecast
import ssl
ssl._create_default_https_context = ssl._create_unverified_context
import urllib.request
from bs4 import BeautifulSoup
from bs4 import UnicodeDammit
try:
    url = 'http://www.weather.com.cn/weather/101080101.shtml'
    headers = {"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.138 Safari/537.36"}
    req = urllib.request.Request(url, headers=headers)
    data = urllib.request.urlopen(req)
    data = data.read()
    dammit = UnicodeDammit(data, ['utf-8', 'gbk'])
    html = dammit.unicode_markup
    # pick a working encoding automatically
    soup = BeautifulSoup(html, 'lxml')
    lis = soup.select("ul[class='t clearfix'] li")
    for li in lis:
        date = li.select('h1')[0].text
        weather = li.select("p[class='wea']")[0].text
        temp1 = li.select("p[class='tem'] i")[0].text
        if li.select("p[class='tem'] span") == []:
            # no <span>: the page shows only one temperature for that day
            temp = temp1
        else:
            temp2 = li.select("p[class='tem'] span")[0].text
            temp = temp1 + '/' + temp2
        print(date, weather, temp)
except Exception as err:
    print(err)
Crawling the page tree
Depth-first and breadth-first helper classes
class Stack:  # list-based stack, for depth-first crawling
    def __init__(self):
        self.st = []
    def pop(self):
        return self.st.pop()
    def push(self, obj):
        return self.st.append(obj)
    def isempty(self):
        return len(self.st) == 0
class Queue:  # queue, for breadth-first crawling
    def __init__(self):
        self.st = []
    def fetch(self):
        return self.st.pop(0)
    def enter(self, obj):
        return self.st.append(obj)
    def isempty(self):
        return len(self.st) == 0
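A sketch of how the two classes above change the visiting order; the link graph is made up for illustration, and a real crawl would also keep a visited set to avoid revisiting pages:
# hypothetical link graph: page -> pages it links to
graph = {'A': ['B', 'C'], 'B': ['D'], 'C': ['E'], 'D': [], 'E': []}

# depth-first: the page pushed last is visited first
st = Stack()
st.push('A')
while not st.isempty():
    page = st.pop()
    print('DFS visit', page)          # A, C, E, B, D
    for nxt in graph[page]:
        st.push(nxt)

# breadth-first: pages are visited in the order they were discovered
q = Queue()
q.enter('A')
while not q.isempty():
    page = q.fetch()
    print('BFS visit', page)          # A, B, C, D, E
    for nxt in graph[page]:
        q.enter(nxt)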
Multithreading
t = Thread(target=..., args=...)
target: the function the thread will run
args: a tuple or list of arguments for target
from threading import Thread
t.setDaemon(False)
# False makes t a foreground thread that the program waits for; True would make it a daemon (background) thread. Newer code sets t.daemon instead.
t.start()
# start the thread
t.join()
# block the current thread until t finishes, then continue
lock = threading.RLock()
# create a (re-entrant) lock
lock.acquire()
# acquire the lock; if another thread has called acquire without release, the current thread blocks until that thread releases the lock
lock.release()
# release the lock
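A minimal sketch putting these pieces together; the worker function and the shared counter are made up for illustration:
import threading

lock = threading.RLock()
total = 0

def worker(n):
    global total
    for _ in range(n):
        lock.acquire()        # only one thread updates total at a time
        total += 1
        lock.release()

threads = []
for _ in range(4):
    t = threading.Thread(target=worker, args=(1000,))
    t.daemon = False          # foreground thread, same effect as setDaemon(False)
    t.start()
    threads.append(t)
for t in threads:
    t.join()                  # wait for all workers before reading total
print(total)                  # 4000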
Multi-threaded scraping of images from the weather site
import ssl
ssl._create_default_https_context = ssl._create_unverified_context
from bs4 import BeautifulSoup
from bs4 import UnicodeDammit
from urllib import parse
import urllib.request
import threading
def imageSpider(start_url):
    global threads
    global count
    try:
        urls = []
        req = urllib.request.Request(start_url, headers=headers)
        data = urllib.request.urlopen(req)
        data = data.read()
        dammit = UnicodeDammit(data, ["utf-8", "gbk"])
        data = dammit.unicode_markup
        soup = BeautifulSoup(data, 'lxml')
        images = soup.select('img')
        for image in images:
            src = image['src']
            url = parse.urljoin(start_url, src)
            if url not in urls:
                print(url)
                urls.append(url)
                count = count + 1
                T = threading.Thread(target=download, args=(url, count))
                # run download in its own thread
                T.setDaemon(False)
                T.start()
                threads.append(T)
    except Exception as err:
        print(err)
def download(url, count):
    if url[len(url)-4] == '.':
        ext = url[len(url)-4:]   # keep the 3-character file extension, e.g. '.jpg'
    else:
        ext = ''
    req = urllib.request.Request(url, headers=headers)
    data = urllib.request.urlopen(req, timeout=100)
    data = data.read()
    fobj = open('image\\' + str(count) + ext, 'wb')
    fobj.write(data)
    fobj.close()
    print('downloaded ' + str(count) + ext)
start_url = 'http://www.weather.com.cn/weather1d/101080101.shtml'
headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.138 Safari/537.36'}
count = 0
threads = []
imageSpider(start_url)
for t in threads:  # wait for every download thread before ending the main program
    t.join()
print('END')
Scrapy
Creating a simple spider
Run pip install scrapy in a virtual Python environment
scrapy startproject XXX
Generate a spider named itcast for the domain itcast.cn:
scrapy genspider itcast itcast.cn
Run a spider without printing logs:
scrapy crawl <spider name> --nolog
Create a .py file in the spiders folder
import scrapy
class MySpider(scrapy.Spider):
    name = "mySpider"
    def start_requests(self):
        # this whole method can be replaced by start_urls = ['https://www.baidu.com']
        url = 'https://www.baidu.com'
        yield scrapy.Request(url=url, callback=self.parse)
        # the downloaded response is handed to the callback; yield returns data without ending the function
    def parse(self, response):
        print(response.url)
        data = response.body.decode()
        print(data)
Create a .py file in the XXX project folder to launch the spider from a script
from scrapy import cmdline
cmdline.execute('scrapy crawl mySpider -s LOG_ENABLED=False'.split())
Finding HTML elements
from scrapy.selector import Selector
selector = Selector(text=html)
s = selector.xpath('//title')
# // matches at any depth, / matches one level down; '//body/book' finds book elements directly under body
# selector.xpath('//book').xpath('./title') finds the direct title children of each book
# selector.xpath('//book').xpath('.//title') finds title at any depth under each book
print(s)
Extracting to a list
s = selector.xpath('//title').extract()
Returns a list of the matched title nodes; .extract_first() returns the first one
Getting attributes and text
s = selector.xpath('//title/@id').extract()
Gets the attribute values
s = selector.xpath("//title[@id='chinese']/text()").extract()
Filters by the value of the id attribute
s = selector.xpath('//title/text()').extract()
Gets the text of the title elements
Using * as a wildcard
* stands for any element node, excluding Text and Comment nodes
s = selector.xpath("//title[@*]")
Matches title elements that have at least one attribute
position() indexing
Numbering starts at 1
//body/title[position()>2][position()<5]
Selects the 3rd through 6th title elements (the second predicate renumbers the nodes kept by the first)
Sibling and parent nodes
s = selector.xpath("//title[@lang='chinese']/parent::*")
Finds the parent node of the title whose lang attribute is 'chinese'
s = selector.xpath("//b[position()=1]/following-sibling::*[position()=1]")
Finds the first sibling node after the first b node
"element/following-sibling::*" finds all following siblings of element at the same level
"element/preceding-sibling::*" finds all preceding siblings of element at the same level
"element/preceding-sibling::*[position()=1]" finds the first (nearest) preceding sibling of element at the same level
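A runnable sketch of these XPath patterns; the HTML fragment and its tag names are made up for illustration:
from scrapy.selector import Selector

html = """<html><body>
  <div class='book'><p id='p1' lang='chinese'>红楼梦</p></div>
  <div class='book'><p id='p2' lang='english'>Hamlet</p></div>
  <b>bold</b><i>italic</i><i>more</i>
</body></html>"""   # hypothetical fragment
selector = Selector(text=html)
print(selector.xpath("//p/text()").extract())                            # text of every <p>
print(selector.xpath("//p/@id").extract())                               # id attribute values
print(selector.xpath("//p[@lang='chinese']/parent::*").extract_first())  # the parent <div>
print(selector.xpath("//b[position()=1]/following-sibling::*[position()=1]").extract())  # <i>italic</i>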
Other .py files
items.py stores the scraped data
Define a class in items.py for holding the data
class YourprojectItem(scrapy.Item):  # inherits from scrapy.Item
    # define the fields for your item here like:
    # name = scrapy.Field()
    title = scrapy.Field()
Use this class in the spider under spiders
from yourProject.items import YourprojectItem
class MySpider(scrapy.Spider):
    name = "mySpider"
    start_urls = ['https://www.baidu.com']
    def parse(self, response):
        data = response.body.decode()
        selector = scrapy.Selector(text=data)
        books = selector.xpath("//book")
        for book in books:
            item = YourprojectItem()
            item["title"] = book.xpath("./title/text()").extract_first()
            yield item
pipelines.py: the item-processing pipeline class
Uncomment in settings.py:
ITEM_PIPELINES = {
    'yourProject.pipelines.YourprojectPipeline': 300,
}
process_item is called once for every scraped item
from itemadapter import ItemAdapter
class YourprojectPipeline(object):
    count = 0
    def process_item(self, item, spider):
        YourprojectPipeline.count += 1
        if YourprojectPipeline.count == 1:
            fobj = open("book.txt", "wt")   # first item: create/truncate the file
        else:
            fobj = open("book.txt", "at")   # later items: append
        print(item["title"])
        fobj.write(item['title'] + '\n')
        fobj.close()
        return item
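Opening and closing the file for every item works, but a common alternative (a sketch, not the only way; the class name here is made up and it would still need registering in ITEM_PIPELINES) is to use the pipeline's open_spider/close_spider hooks so the file is opened once:
class BookFilePipeline(object):   # hypothetical pipeline class
    def open_spider(self, spider):
        self.fobj = open("book.txt", "w", encoding="utf-8")   # opened once when the spider starts
    def process_item(self, item, spider):
        self.fobj.write(item["title"] + "\n")
        return item
    def close_spider(self, spider):
        self.fobj.close()          # closed once when the spider finishes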