半次元图片爬虫
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97 | import requests
import json
import re
import os
import threading
from lxml import etree
def Requests(url):
    """Fetch ``url`` with a browser-like User-Agent, retrying on failure.

    Up to 10 attempts are made, each with a 10-second timeout. An attempt
    fails when ``requests.get`` raises a ``requests`` error or when the
    response status code is not 200.

    Args:
        url: Address to request.

    Returns:
        The ``requests.Response`` of the first attempt answered with HTTP
        200, or ``False`` when every attempt failed.
    """
    head = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.25 Safari/537.36 Core/1.70.3741.400 QQBrowser/10.5.3863.400'}
    max_attempts = 10
    for _ in range(max_attempts):
        try:
            response = requests.get(url, headers=head, timeout=10)
        except requests.RequestException:
            # Only network/HTTP-layer errors are retried; KeyboardInterrupt
            # and SystemExit propagate so the user can still abort.
            continue
        if response.status_code == 200:
            return response
    return False
def folder_mkdir(title):
    """Create (if needed) a sub-folder of the working directory named ``title``.

    Args:
        title: Folder name, appended to the current working directory.

    Returns:
        The folder path as a string ending with the platform path separator,
        so callers can build a file path with ``folder + name``.
    """
    # os.sep instead of a hard-coded backslash: identical result on Windows,
    # and a real sub-directory on other platforms.
    folder = os.getcwd() + os.sep + title + os.sep
    # exist_ok avoids the check-then-create race of exists() + mkdir().
    os.makedirs(folder, exist_ok=True)
    return folder
def install_img(url, folder, name):
    """Download one image and write it to ``folder + name``.

    Args:
        url: Address of the image.
        folder: Target directory path; concatenated directly with ``name``,
            so it must already end with a path separator.
        name: File name (including extension) for the saved image.

    Returns:
        None. When the download fails, an error line is printed and no file
        is written.
    """
    response = Requests(url)
    if response is False:
        # Requests() signals "all retries failed" with False.
        print('错误:{} 名称:{}'.format(url, name))
        return
    # The context manager closes the file handle even if the write fails.
    with open(folder + name, 'wb') as img_file:
        img_file.write(response.content)
def get_data(item_host_url):
    """Download every image of the post page at ``item_host_url``.

    The page is fetched with ``Requests``; a folder named after the page
    ``<title>`` is created with ``folder_mkdir``; the JSON passed to
    ``window.__ssr_data = JSON.parse("...")`` in the page source is decoded,
    and each entry of ``detail.post_data.multi`` is saved via ``install_img``
    under a sequential numeric file name.

    Args:
        item_host_url: URL of the post page.

    Returns:
        None. Returns early when the page cannot be fetched or the embedded
        data cannot be found. In thread mode the download threads may still
        be running when this function returns.
    """
    import time  # local import: only needed for the thread throttle below

    mode = 'thread'  # multi-threaded download mode
    item_response = Requests(item_host_url)
    # Check for failure BEFORE touching the response; Requests() returns
    # False when every retry failed.
    if item_response is False:
        return
    # Set the encoding before the first access to .text so the title used
    # for the folder name is decoded the same way as the rest of the page.
    item_response.encoding = 'utf-8'
    page = etree.HTML(item_response.text)
    title = page.xpath('//title/text()')[0]
    folder = folder_mkdir(title)

    match = re.search(
        r'window.__ssr_data = JSON.parse\("(.*?)"\);\n window._UID_ = \'0\';',
        item_response.text)
    if match is None:
        # Page does not contain the expected embedded data; nothing to do.
        return
    # Turn the JS string literal back into plain JSON text: unescape the
    # quotes, drop the 'u002F' markers and map the remaining double
    # backslashes to '/' (presumably restoring escaped slashes in URLs).
    item_data = match.group(1).replace('\\"', '"').replace('u002F', '').replace('\\\\', '/')
    item_img_data = json.loads(item_data, strict=False)['detail']['post_data']['multi']

    # Continue numbering after the files already present to avoid overwriting.
    num = len(os.listdir(folder))
    for img_data in item_img_data:
        img_url = img_data['original_path']
        print(img_url)
        if 'jpg' in img_url:
            img_fomat = '.jpg'
        elif 'png' in img_url:
            img_fomat = '.png'
        else:
            img_fomat = '.jpg'  # default when the URL names neither format
        num += 1
        name = str(num) + img_fomat
        if mode == 'thread':
            threading.Thread(target=install_img, args=(img_url, folder, name)).start()
            # Throttle: wait until at most 25 threads are alive, sleeping
            # between checks instead of spinning the CPU.
            while threading.active_count() > 25:
                time.sleep(0.05)
        else:
            install_img(img_url, folder, name)
if __name__ == '__main__':
    # Interactive loop: prompt for a post URL, download its images, report,
    # then prompt again. The loop has no exit condition of its own.
    while True:
        get_data(input("请输入url地址:"))
        print('下载完成')
|
Requests()
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15 | import requests
def Requests(url):
    """Fetch ``url`` with a browser-like User-Agent, retrying on failure.

    Up to 10 attempts are made, each with a 10-second timeout. An attempt
    fails when ``requests.get`` raises a ``requests`` error or when the
    response status code is not 200; after too many failures the link is
    given up on.

    Args:
        url: Address to request.

    Returns:
        The ``requests.Response`` of the first attempt answered with HTTP
        200, or ``False`` when every attempt failed.
    """
    head = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.25 Safari/537.36 Core/1.70.3741.400 QQBrowser/10.5.3863.400'}
    max_attempts = 10  # number of requests before the link is skipped
    for _ in range(max_attempts):
        try:
            response = requests.get(url, headers=head, timeout=10)
        except requests.RequestException:
            # Only network/HTTP-layer errors are retried; KeyboardInterrupt
            # and SystemExit propagate so the user can still abort.
            continue
        if response.status_code == 200:
            return response
    return False
|
构建一个常用的请求模块,在错误时重新请求,超过一定请求次数后跳过
folder_mkdir()
| import os
def folder_mkdir():
    """Create (if needed) a ``pic`` folder in the working directory.

    Returns:
        The folder path as a string ending with the platform path separator,
        so callers can build a file path with ``folder + name``.
    """
    # os.sep instead of a hard-coded backslash: identical result on Windows,
    # and a real sub-directory on other platforms.
    folder = os.getcwd() + os.sep + 'pic' + os.sep
    # exist_ok avoids the check-then-create race of exists() + mkdir().
    os.makedirs(folder, exist_ok=True)
    return folder
|
创建一个空文件夹用于储存图片
get_data()
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
def get_data(item_host_url):
    """Download every image of the post page at ``item_host_url``.

    The target folder comes from ``folder_mkdir``; the page is fetched with
    ``Requests``; the JSON passed to ``window.__ssr_data = JSON.parse("...")``
    in the page source is decoded, and each entry of
    ``detail.post_data.multi`` is saved via ``install_img`` under a
    sequential numeric file name.

    Args:
        item_host_url: URL of the post page.

    Returns:
        None. Returns early when the page cannot be fetched or the embedded
        data cannot be found. In thread mode the download threads may still
        be running when this function returns.
    """
    import time  # local import: only needed for the thread throttle below

    mode = 'thread'  # multi-threaded download mode
    folder = folder_mkdir()
    item_response = Requests(item_host_url)
    if item_response is False:
        # Requests() returns False when every retry failed.
        return
    item_response.encoding = 'utf-8'

    match = re.search(
        r'window.__ssr_data = JSON.parse\("(.*?)"\);\n window._UID_ = \'0\';',
        item_response.text)
    if match is None:
        # Page does not contain the expected embedded data; nothing to do.
        return
    # Turn the JS string literal back into plain JSON text: unescape the
    # quotes, drop the 'u002F' markers and map the remaining double
    # backslashes to '/' (presumably restoring escaped slashes in URLs).
    item_data = match.group(1).replace('\\"', '"').replace('u002F', '').replace('\\\\', '/')
    item_img_data = json.loads(item_data, strict=False)['detail']['post_data']['multi']

    # Continue numbering after the files already present in the folder.
    num = len(os.listdir(folder))
    print(num)
    for img_data in item_img_data:
        img_url = img_data['original_path']
        if 'jpg' in img_url:
            img_fomat = '.jpg'
        elif 'png' in img_url:
            img_fomat = '.png'
        else:
            img_fomat = '.jpg'  # default when the URL names neither format
        num += 1
        name = str(num) + img_fomat
        if mode == 'thread':
            threading.Thread(target=install_img, args=(img_url, folder, name)).start()
            # Throttle: wait until at most 25 threads are alive, sleeping
            # between checks instead of spinning the CPU.
            while threading.active_count() > 25:
                time.sleep(0.05)
        else:
            install_img(img_url, folder, name)
|
请求图集链接,获取网页源码,从