
Python Web Scraping Learning Notes

Web Scraping Learning

Approach

  • Send a request to the target site and get the response
  • After getting a 200 response, parse the HTML
    • Create a new file to parse the HTML (e.g. via regex matching)
    • Call bs4 to parse our HTML (html5lib parses the page the way a browser would)
    • Extract the information we need, e.g. images -> img tag -> src attribute value
    • Send another request for that src path
    • Once that request succeeds, write the image data to our file in binary mode (see the minimal sketch after this list)
    • Repeat to scrape in bulk
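
To make the last two steps concrete, here is a minimal sketch of requesting an image URL and writing the bytes to disk (the URL is a placeholder, not a real target):

```python
import requests

# Placeholder image address, for illustration only
src = "http://example.com/sample.jpg"

response = requests.get(src)
if response.status_code == 200:
    # Image data is binary, so the file must be opened in "wb" mode
    with open("sample.jpg", "wb") as file:
        file.write(response.content)
```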

Example 1: Downloading images in bulk

```python
import os
import requests
from bs4 import BeautifulSoup

# Download directory
download = "images/"
if not os.path.exists(download):
    os.mkdir(download)

for i in range(1, 20):
    if i == 1:
        # First page of the site to scrape
        url = "http://sc.chinaz.com/tupian/index.html"
    else:
        url = "http://sc.chinaz.com/tupian/index_%d.html" % i

    # Send the request and get the response; status code 200 means success
    response = requests.get(url)
    if response.status_code == 200:
        # Parse the page with bs4
        bs = BeautifulSoup(response.content, "html5lib")
        # Locate the div that holds the images; attrs defaults to class
        warp = bs.find("div", attrs={"id": "container"})
        # Grab the img tags (for audio it would be the audio tag's src)
        imglist = warp.find_all("img")
        # Get each image's name -- use the debugger to inspect the tags
        for img in imglist:
            title = img["alt"]
            # the address sits in src2 here (likely lazy-loading), not src
            src = img["src2"]
            # Request the image address and save the file into the folder
            with open(download + title + ".jpg", "wb") as file:
                file.write(requests.get(src).content)
        print("Page %d done" % i)
```

Example 2: A single novel chapter

First steps with Python scraping: BeautifulSoup's find, find_all, and select methods
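
For reference, a quick side-by-side of those methods on a made-up snippet of HTML (a sketch, not from the linked article):

```python
from bs4 import BeautifulSoup

# Made-up HTML for illustration
html = '<div class="showtxt"><p id="a">one</p><p>two</p></div>'
bs = BeautifulSoup(html, "html5lib")

print(bs.find("p"))                     # first matching tag (or None)
print(bs.find_all("p"))                 # list of all matching tags
print(bs.find("p", attrs={"id": "a"}))  # filter by attribute
print(bs.select(".showtxt p"))          # CSS selector, returns a list
print(bs.select_one(".showtxt").text)   # CSS selector, first match only
```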

```python
import os
import requests
from bs4 import BeautifulSoup

download = "novel/"
if not os.path.exists(download):
    os.mkdir(download)
# Send the request and get the response
url = "https://www.biqukan.com/42_42882/14586139.html"
response = requests.get(url)
# Parse the response content
if response.status_code == 200:
    bs = BeautifulSoup(response.content, "html5lib")
    # Get the title
    # bs.find()
    # bs.find_all()
    # bs.find_all("div", attrs={"": ""})
    h1 = bs.select_one("h1")
    title = h1.text
    # Get the chapter's body text
    showtxt = bs.select_one(".showtxt").text
    # Clean the text (it's a string) -- strip the unwanted bits with replace
    res = showtxt.replace("app2();", "")
    with open(download + title + ".txt", "w", encoding="utf-8") as file:
        file.write(res)
```

Example 3: A whole novel

```python
import os
import requests
from bs4 import BeautifulSoup

header = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.109 Safari/537.36'}

url = "https://www.biqukan.com/42_42882/"
response = requests.get(url, headers=header)
response.encoding = 'utf-8'

if response.status_code == 200:
    bs = BeautifulSoup(response.content, "html5lib")
    title = bs.select_one("h2").text + "/"
    # Once we have the novel's title, create a folder named after it
    # download path = "xxx/"
    if not os.path.exists(title):
        os.mkdir(title)
    # Find the div that lists the chapters
    listman = bs.select_one(".listmain")
    # Find all the a tags inside it
    alist = listman.select("a")
    for a in alist:
        aTitle = a.text
        href = a["href"].split("/")[-1]
        # print(aTitle, href)
        # Send another request to reach the chapter page itself
        urlChild = url + href
        responseChild = requests.get(urlChild)
        if responseChild.status_code == 200:
            bsChild = BeautifulSoup(responseChild.content, "html5lib")
            # Get the chapter's body text; type(showtxt) is str
            showtxt = bsChild.select_one(".showtxt").text
            # Strip the unwanted bits with replace
            res = showtxt.replace("app2();", "")
            with open(title + aTitle + ".txt", "w", encoding="utf-8") as file:
                file.write(res)
```

Web Scraping Learning 2

I honestly don't know how these past few days went by. If you ask how much I've learned, it isn't much; but I can't say I haven't been studying either, since I've spent every afternoon of the past two weeks on this.

Scraping audio

```python
import os
import requests
from bs4 import BeautifulSoup

download = "music/"
if not os.path.exists(download):
    os.mkdir(download)

url = "https://www.i4.cn/ring_22_0_1.html"
response = requests.get(url)
if response.status_code == 200:
    bs = BeautifulSoup(response.content, "html5lib")
    # Find the div that holds the music (select_one works like find)
    kbox = bs.select_one(".kbox")
    # Find the ring_list entries
    ring = kbox.select(".ring_list")
    for li in ring:
        title = li.select_one(".title").text
        # title = li.select_one(".title")["title"]
        # print(title)
        src = li.select_one(".audio_play")["data-mp3"]
        with open(download + title + ".mp3", "wb") as file:
            r = requests.get(src)
            file.write(r.content)
```

Only the imported package differs, so some of the calls are written differently:

```python
import os
import urllib.request as request
from bs4 import BeautifulSoup

download = "music/"
if not os.path.exists(download):
    os.mkdir(download)

url = "https://www.i4.cn/ring_22_0_1.html"

response = request.urlopen(url)
# print(response.read())  # reads the body as bytes

bs = BeautifulSoup(response.read(), "html5lib")
# Find the div that holds the music (select_one works like find)
kbox = bs.select_one(".kbox")
# Find the ring_list entries
ring = kbox.select(".ring_list")
for li in ring:
    title = li.select_one(".title").text
    src = li.select_one(".audio_play")["data-mp3"]
    # urlretrieve downloads the URL straight to a file
    request.urlretrieve(src, download + title + ".mp3")
```

Scraping Douban comments and posts

```python
import os
import urllib.request
from bs4 import BeautifulSoup
import csv

header = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.109 Safari/537.36'
}

download = "comment/"
if not os.path.exists(download):
    os.mkdir(download)

# Parasite (the movie)
url = "https://movie.douban.com/subject/27010768/"

req = urllib.request.Request(url=url, headers=header, method='GET')
print(req)
response = urllib.request.urlopen(req)
# print(response.read())  # reads the body as bytes

bs = BeautifulSoup(response.read(), "html5lib")
# Find the div that holds the Douban comments
comments = bs.select_one("#comments-section")
# Find the individual comments
shortInfo = comments.select(".comment")
# Create a list to hold the data
comlist = []
for li in shortInfo:
    info = li.select_one(".comment-info")
    # Commenter's name
    name = info.select_one("a").text
    # Comment text
    shortcom = li.select_one(".short").text
    # Turn the data into a dict so it can be written to the csv file
    commentText = {"name": name, "comment-info": shortcom}
    comlist.append(commentText)

with open("寄生虫.csv", "w", newline="", encoding="utf-8") as file:
    csvWr = csv.writer(file)
    for Scom in comlist:
        csvWr.writerow([Scom["name"], Scom["comment-info"]])
        # csvWr.writerows([Scom["name"], Scom["comment-info"]])
        # the extra s would turn every single character into its own cell
```

That last part is worth paying attention to: small details like this can skew the final result.
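
To make that detail concrete: writerow takes one row, writerows takes a list of rows (a minimal sketch, separate from the code above):

```python
import csv

rows = [["alice", "great movie"], ["bob", "too long"]]

with open("demo.csv", "w", newline="", encoding="utf-8") as file:
    writer = csv.writer(file)
    writer.writerow(rows[0])    # one row: alice,great movie
    writer.writerows(rows)      # many rows at once
    # writer.writerows(rows[0]) would iterate over the strings,
    # turning every single character into its own cell
```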


Scraping video

First, download ffmpeg; it can pull a video stream down from its source.

```python
import requests
from bs4 import BeautifulSoup

url = "http://jx618g.com/?url="
response = requests.get(url)
bs = BeautifulSoup(response.content, "html5lib")
print(bs)

# Find the src of the m3u8 playlist

# cmd: ffmpeg -i <m3u8 url> -vcodec copy -acodec copy xxx.mp4
```
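
The same ffmpeg command can also be launched from Python; a minimal sketch, assuming ffmpeg is on the PATH and that m3u8_url is a placeholder for a real playlist address:

```python
import subprocess

# Placeholder m3u8 address found via the page source
m3u8_url = "http://example.com/stream/index.m3u8"

# Copy the video and audio streams into an mp4 container without re-encoding
subprocess.run([
    "ffmpeg", "-i", m3u8_url,
    "-vcodec", "copy",
    "-acodec", "copy",
    "output.mp4",
], check=True)
```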

A bit of daily rambling

It's rare to have time to ramble a little. I was planning to find a better image host, since freeloading on Weibo all this time doesn't feel great, but it turns out there aren't many good ones. I figure that once the upgrade over at Coding is finished, I'll move a copy of my blog across; ICP registration for the site is still under consideration. Worst case, I'll just dump all of this into OneNote. The site is small anyway, and a CDN at a few hundred yuan a year doesn't really seem worth the money. If I ever make it big and the traffic grows, I'll revisit it.

Author: 晓黑
Link: https://www.suk1.top/2020/02/28/pySpider/
Copyright: Unless otherwise noted, all posts on this blog are licensed under CC BY-NC-SA 4.0. For reposts, please credit Manayakko - 微笑才是王道.