
#coding=utf8
import math
import os
from threading import Thread
import sqlite3
import requests
from PIL import Image
from io import BytesIO
from bs4 import BeautifulSoup
def getHtml(url):
    """Collect image URLs from ``<img>`` tags inside the ``.tpc_content`` node.

    Args:
        url: Address of the page to download (decoded as GBK).

    Returns:
        A list of non-empty attribute values, in document order. An empty
        list is returned, after printing a failure notice, when the download
        or parsing fails or the page has no ``.tpc_content`` element.
    """
    try:
        response = requests.get(url=url, timeout=10, verify=False)
        response.encoding = "GBK"
        soup = BeautifulSoup(response.text, 'html.parser')
        content = soup.select_one(".tpc_content")
        if content is None:
            print("解析:" + url + "失败")
            return []
        list_img_url = []
        for imgone in content.find_all('img'):
            # NOTE(review): the original reads 'datasrc'. Hyphens look stripped
            # elsewhere in this paste, so 'data-src' is tried as a fallback --
            # confirm against the target page's markup.
            img_url = imgone.get('datasrc') or imgone.get('data-src')
            # Tags without the attribute are skipped; a None entry would
            # crash the caller when it derives the file extension.
            if img_url:
                list_img_url.append(img_url)
        return list_img_url
    except Exception:
        # Exception (not a bare except) so Ctrl-C still interrupts the run.
        print("解析:" + url + "失败")
        return []
def gettitle(url):
    """Download *url* and return the first space-delimited token of its title.

    The page is decoded as GBK. Network and parsing errors are not caught
    here; they propagate to the caller.
    """
    page = requests.get(url=url, timeout=10, verify=False)
    page.encoding = "GBK"
    title_tag = BeautifulSoup(page.text, 'html.parser').find("title")
    # Only the text before the first space is kept.
    return title_tag.text.split(" ")[0]
def getHtml2(url):
    """Collect image URLs from ``<input type="image">`` tags in ``.tpc_content``.

    Args:
        url: Address of the page to download (decoded as GBK).

    Returns:
        A list of non-empty attribute values, in document order. An empty
        list is returned, after printing a failure notice, when the download
        or parsing fails or the page has no ``.tpc_content`` element.
    """
    try:
        response = requests.get(url=url, timeout=10, verify=False)
        response.encoding = "GBK"
        soup = BeautifulSoup(response.text, 'html.parser')
        content = soup.select_one(".tpc_content")
        if content is None:
            print("解析:" + url + "失败")
            return []
        list_img_url = []
        for imgone in content.find_all('input', {"type": "image"}):
            # NOTE(review): the original reads 'datasrc'. Hyphens look stripped
            # elsewhere in this paste, so 'data-src' is tried as a fallback --
            # confirm against the target page's markup.
            img_url = imgone.get('datasrc') or imgone.get('data-src')
            # Skip tags without the attribute instead of collecting None.
            if img_url:
                list_img_url.append(img_url)
        return list_img_url
    except Exception:
        # Exception (not a bare except) so Ctrl-C still interrupts the run.
        print("解析:" + url + "失败")
        return []
def getHtml3(url):
    """Collect image URLs from every ``<input type="image">`` tag on the page.

    Unlike ``getHtml2`` the search is not limited to ``.tpc_content``. When
    more than two matching tags are found, the first one is discarded before
    the URLs are read.

    Args:
        url: Address of the page to download (decoded as GBK).

    Returns:
        A list of non-empty attribute values, in document order; an empty
        list (after printing a failure notice) when the download or parsing
        fails.
    """
    try:
        response = requests.get(url=url, timeout=10, verify=False)
        response.encoding = "GBK"
        soup = BeautifulSoup(response.text, 'html.parser')
        list_img = soup.find_all('input', {"type": "image"})
        if len(list_img) > 2:
            # NOTE(review): presumably the first match is not a post image
            # (e.g. a banner) -- confirm against the target page.
            del list_img[0]
        list_img_url = []
        for imgone in list_img:
            # NOTE(review): the original reads 'datasrc'. Hyphens look stripped
            # elsewhere in this paste, so 'data-src' is tried as a fallback --
            # confirm against the target page's markup.
            img_url = imgone.get('datasrc') or imgone.get('data-src')
            # Skip tags without the attribute instead of collecting None.
            if img_url:
                list_img_url.append(img_url)
        return list_img_url
    except Exception:
        # Exception (not a bare except) so Ctrl-C still interrupts the run.
        print("解析:" + url + "失败")
        return []
def getHtml4(url):
    """Collect image URLs from every ``<img>`` tag on the page.

    Tags without the attribute are ignored. When more than two URLs are
    collected, the first one is discarded.

    Args:
        url: Address of the page to download (decoded as GBK).

    Returns:
        A list of attribute values, in document order; an empty list (after
        printing a failure notice) when the download or parsing fails.
    """
    try:
        response = requests.get(url=url, timeout=10, verify=False)
        response.encoding = "GBK"
        soup = BeautifulSoup(response.text, 'html.parser')
        list_img_url = []
        for imgone in soup.find_all('img'):
            # NOTE(review): the original reads 'datasrc'. Hyphens look stripped
            # elsewhere in this paste, so 'data-src' is tried as a fallback --
            # confirm against the target page's markup.
            img_url = imgone.get('datasrc') or imgone.get('data-src')
            if img_url:
                list_img_url.append(img_url)
        # Applied once after collection, mirroring getHtml3. Trimming inside
        # the loop would cap the result at two entries, so the caller's
        # "more than two" test could never succeed.
        if len(list_img_url) > 2:
            del list_img_url[0]
        return list_img_url
    except Exception:
        # Exception (not a bare except) so Ctrl-C still interrupts the run.
        print("解析:" + url + "失败")
        return []
def myse(path, img_url):
    """Download *img_url* and write the raw response bytes to *path*.

    Progress and failures are reported with ``print``; no exception derived
    from ``Exception`` escapes this function.

    Args:
        path: Destination file path (opened in binary write mode).
        img_url: Address of the image to download.
    """
    try:
        print("开始保存:" + path)
        response = requests.get(img_url, timeout=10, verify=False)
        # Without this check an HTTP error page (404, 403, ...) would be
        # written to disk and later treated as an already-downloaded image.
        response.raise_for_status()
        with open(path, 'wb') as f:
            f.write(response.content)
        print("保存成功:" + path)
    except Exception as e:
        # Show the cause as well as the failing target.
        print(str(e))
        print("保存失败:" + path + ":" + img_url)
def seimag(img_url, title, filename, i, path):
    """Record one image in the database and download it unless already saved.

    The file name is ``title`` followed by ``i`` and the extension taken from
    the URL. A file that already exists with a non-zero size is left alone;
    a missing or empty file is (re)downloaded.

    Args:
        img_url: Address of the image.
        title: Page title, used as the file-name prefix.
        filename: Accepted for backward compatibility. Both branches of the
            original built the name from ``title`` only, so the value does
            not influence the result.
        i: Sequence number of the image within the page.
        path: Target directory, expected to end with a path separator.
    """
    from urllib.parse import urlsplit

    # The extension comes from the URL path only. The original used
    # split(".")[1], which returns a piece of the host name
    # ("http://www.example.com/a.jpg" -> ".example"); query strings are
    # ignored here as well.
    houzhui = os.path.splitext(urlsplit(img_url).path)[1]
    img_name = title + str(i) + houzhui
    path = path + img_name
    sqliteDb(img_name, title, img_url)
    if os.path.isfile(path) and os.path.getsize(path) > 0:
        print("已存在:" + path)
        return
    # Either the file is missing or a previous attempt left it empty.
    myse(path, img_url)
def urlrun(url, filename, path):
    """Extract the image URLs of one page and save every image.

    The four extractors are tried in order until one yields more than two
    URLs. If none does, the page is reported as problematic and whatever the
    last extractor returned is still saved.

    Args:
        url: Address of the page to process.
        filename: Ignored; it is overwritten with an empty string before use,
            as in the original.
        path: Target directory, expected to end with a path separator.
    """
    filename = ""
    extractors = (
        ("第一种方法图片个数:", getHtml),
        ("第二种方法图片个数:", getHtml2),
        ("第三种方法图片个数:", getHtml3),
        ("第四种方法图片个数:", getHtml4),
    )
    img_list_url = []
    for label, extract in extractors:
        # Each extractor is called once; the original downloaded the page
        # twice for the first method.
        img_list_url = extract(url) or []
        print(label + str(len(img_list_url)))
        if len(img_list_url) > 2:
            break
    else:
        # No method produced more than two images.
        print(url)
        print("有问题")
    try:
        title = gettitle(url)
    except Exception as e:
        # gettitle does not catch its own errors. Letting one escape would
        # end the worker thread and drop the remaining URLs of its batch.
        print(str(e))
        print("解析:" + url + "失败")
        return
    print("标题为:" + title)
    for i, img_url in enumerate(img_list_url, 1):
        seimag(img_url, title, filename, i, path)
    print("结束:")
    print(url)
def threadrun(list_url2, path):
    """Process a batch of page URLs sequentially (thread target).

    Args:
        list_url2: Iterable of page URLs handled by this worker.
        path: Target directory passed through to ``urlrun``.
    """
    for url in list_url2:
        try:
            urlrun(url, None, path)
        except Exception as e:
            # Keep going: one bad page must not abort the rest of the batch.
            print(str(e))
            print("解析:" + str(url) + "失败")
def sqliteDb(name, title, url,
             db_path="/home/test/PycharmProjects/MyPython/picture/Rul.db"):
    """Store or refresh the link of one image in the ``caoliu`` table.

    If a row with ``img_name == name`` exists, only its ``img_url`` is
    updated; otherwise a new row is inserted. The operation is best-effort:
    database errors are printed and not raised.

    Args:
        name: Image file name; the lookup key.
        title: Page title stored with a newly inserted row.
        url: Image address.
        db_path: Location of the SQLite file. Defaults to the path that was
            hard-coded in the original.
    """
    con = None
    try:
        con = sqlite3.connect(db_path)
        cur = con.cursor()
        # Create the table on first use so a fresh database file works.
        cur.execute(
            "CREATE TABLE IF NOT EXISTS caoliu("
            "img_name TEXT PRIMARY KEY, title TEXT, img_url TEXT)"
        )
        cur.execute("SELECT 1 FROM caoliu WHERE img_name=?", (name,))
        if cur.fetchone() is not None:
            cur.execute("UPDATE caoliu SET img_url=? WHERE img_name=?", (url, name))
        else:
            # Bound parameters: a quote character in the title or URL broke
            # the original string-built statement.
            cur.execute(
                "INSERT INTO caoliu (img_name,title,img_url) VALUES (?,?,?)",
                (name, title, url),
            )
        con.commit()
    except sqlite3.Error as e:
        print("数据库写入失败:" + str(e))
    finally:
        # Close the connection on the error path too.
        if con is not None:
            con.close()
if __name__ == '__main__':
    # Interactive loop: read a comma-separated list of page URLs and a target
    # directory, split the URLs across worker threads, wait for all of them,
    # then prompt again. Entering "q" at either prompt exits.
    requests.packages.urllib3.disable_warnings()
    print("q为退出")
    while True:
        list_url_str = input("输入网址数列(,分割):")
        if list_url_str == "q":
            # SystemExit instead of quit(): quit is only defined when the
            # site module is loaded.
            raise SystemExit(0)
        # The prompt shows a full-width comma, so both comma forms are
        # accepted; blank entries are dropped.
        list_url = [
            u.strip()
            for u in list_url_str.replace(",", ",").split(",")
            if u.strip()
        ]
        path = input("保存地址:")
        if path == "q":
            raise SystemExit(0)
        path = path.replace(" ", "")
        if path == "":
            path = "/home/test/Pictures/picture/"  # default save location
        # The original compared path[1] by identity, which appended a slash
        # to almost every path and raised IndexError on one-character input.
        if not path.endswith("/"):
            path = path + "/"
        try:
            os.makedirs(path, exist_ok=True)
        except OSError as e:
            print(str(e))
            continue
        lens = len(list_url)
        # Ten workers for 20 or more URLs, four otherwise (as in the original).
        workers = 10 if lens >= 20 else 4
        size = math.ceil(lens / workers)
        threads = []
        for i in range(workers):
            list_url2 = list_url[i * size:i * size + size]
            if not list_url2:
                continue
            t = Thread(target=threadrun, args=(list_url2, path))
            t.start()
            threads.append(t)
        # Join every worker, not just the last one created, so the message
        # below is printed only after all downloads have finished.
        for t in threads:
            t.join()
        print("全部结束")
注:/home/test/PycharmProjects/MyPython/picture/Rul.db 是我的数据库地址,请修改为自己的地址;数据库仅用于存储图片链接。
二:pyqt5版本
# -*- coding: utf-8 -*-
# Form implementation generated from reading ui file ‘爬取pyqt测试.ui’
#
# Created by: PyQt5 UI code generator 5.13.2
#
# WARNING! All changes made in this file will be lost!
import os
import re
import requests
from PyQt5 import QtCore, QtGui, QtWidgets
from bs4 import BeautifulSoup
class Ui_Form(object):
    """Form of the image scraper together with the scraping logic behind it.

    ``setupUi`` and ``retranslateUi`` build the widgets. ``paqu`` is the
    button handler, and the remaining methods download a page, extract image
    URLs with a user-supplied regular expression and save the images.
    """

    def setupUi(self, Form):
        """Create and place the widgets on *Form* and connect the start button."""
        Form.setObjectName("Form")
        Form.resize(738, 611)
        self.paquButton = QtWidgets.QPushButton(Form)
        self.paquButton.setGeometry(QtCore.QRect(600, 230, 91, 31))
        self.paquButton.setObjectName("paquButton")
        self.label = QtWidgets.QLabel(Form)
        self.label.setGeometry(QtCore.QRect(110, 70, 68, 23))
        font = QtGui.QFont()
        font.setPointSize(14)
        self.label.setFont(font)
        self.label.setObjectName("label")
        self.lineEdit_url = QtWidgets.QLineEdit(Form)
        self.lineEdit_url.setGeometry(QtCore.QRect(190, 70, 381, 31))
        self.lineEdit_url.setObjectName("lineEdit_url")
        self.lineEdit_path = QtWidgets.QLineEdit(Form)
        self.lineEdit_path.setGeometry(QtCore.QRect(210, 230, 311, 31))
        self.lineEdit_path.setObjectName("lineEdit_path")
        self.label_2 = QtWidgets.QLabel(Form)
        self.label_2.setGeometry(QtCore.QRect(110, 230, 101, 23))
        font = QtGui.QFont()
        font.setPointSize(14)
        self.label_2.setFont(font)
        self.label_2.setObjectName("label_2")
        self.textBrowser_print = QtWidgets.QTextBrowser(Form)
        self.textBrowser_print.setGeometry(QtCore.QRect(110, 290, 581, 241))
        self.textBrowser_print.setObjectName("textBrowser_print")
        self.lineEdit_guize = QtWidgets.QLineEdit(Form)
        self.lineEdit_guize.setGeometry(QtCore.QRect(210, 150, 311, 31))
        self.lineEdit_guize.setObjectName("lineEdit_guize")
        self.label_3 = QtWidgets.QLabel(Form)
        self.label_3.setGeometry(QtCore.QRect(110, 150, 91, 23))
        font = QtGui.QFont()
        font.setPointSize(14)
        self.label_3.setFont(font)
        self.label_3.setObjectName("label_3")
        self.comboBox_yemian = QtWidgets.QComboBox(Form)
        self.comboBox_yemian.setGeometry(QtCore.QRect(610, 70, 100, 31))
        self.comboBox_yemian.setObjectName("comboBox_yemian")
        self.comboBox_yemian.addItem("")
        self.comboBox_yemian.addItem("")
        self.label_4 = QtWidgets.QLabel(Form)
        self.label_4.setGeometry(QtCore.QRect(540, 150, 91, 23))
        # Reuses the 14pt font object created for label_3.
        self.label_4.setFont(font)
        self.label_4.setObjectName("label_4")
        self.lineEdit_shuxing = QtWidgets.QLineEdit(Form)
        self.lineEdit_shuxing.setGeometry(QtCore.QRect(630, 150, 61, 31))
        self.lineEdit_shuxing.setObjectName("lineEdit_shuxing")
        self.retranslateUi(Form)
        self.paquButton.clicked.connect(self.paqu)
        QtCore.QMetaObject.connectSlotsByName(Form)

    def retranslateUi(self, Form):
        """Set the window title and the visible texts of all widgets."""
        _translate = QtCore.QCoreApplication.translate
        Form.setWindowTitle(_translate("Form", "图片爬取 by怪盗LYL"))
        self.paquButton.setText(_translate("Form", "爬取"))
        self.label.setText(_translate("Form", "网址:"))
        self.label_2.setText(_translate("Form", "保存地址:"))
        self.label_3.setText(_translate("Form", "解析规则:"))
        self.comboBox_yemian.setItemText(0, _translate("Form", "单个页面爬取"))
        self.comboBox_yemian.setItemText(1, _translate("Form", "需要翻页"))
        self.label_4.setText(_translate("Form", "图片属性:"))
        # NOTE(review): hyphens look stripped elsewhere in this paste; the
        # default attribute may originally have been 'data-src' -- confirm.
        self.lineEdit_shuxing.setText(_translate("Form", 'datasrc'))

    def paqu(self):
        """Handle a click on the start button.

        Only the first combo-box entry (single page) starts a run; the
        second entry (paging) has no handler in this class.
        """
        requests.packages.urllib3.disable_warnings()
        path = self.lineEdit_path.text()
        url = self.lineEdit_url.text()
        if self.comboBox_yemian.currentIndex() == 0:
            self.urlrun(url=url, path=path)

    def printf(self, mypstr):
        """Append *mypstr* to the log view, scroll to the end and refresh the UI."""
        self.textBrowser_print.append(mypstr)
        self.cursor = self.textBrowser_print.textCursor()
        # Move the cursor to the end so the newest line is visible.
        self.textBrowser_print.moveCursor(self.cursor.End)
        # Process pending events so the window stays responsive during a run.
        QtWidgets.QApplication.processEvents()

    def getHtml(self, url, shuxing, guize):
        """Download *url* and extract image URLs using the user's rule.

        Args:
            url: Address of the page to download (decoded as GBK).
            shuxing: Name of the HTML attribute that holds the image URL.
            guize: Regular expression applied to the page markup; every
                match (or every group of a match) is searched for *shuxing*.

        Returns:
            A list of attribute values (possibly empty), or ``None`` when
            downloading or parsing failed. The error is written to the log.
        """
        try:
            response = requests.get(url=url, timeout=10, verify=False)
            response.encoding = "GBK"
            soup = BeautifulSoup(response.text, 'html.parser')
            item_pattern = re.compile(guize, re.S)
            # Compiled once. The name is escaped because the membership test
            # below treats it as plain text.
            attr_pattern = re.compile(re.escape(shuxing) + '="(.*?)"', re.S)
            img_list = []
            for item in item_pattern.findall(str(soup.contents)):
                # findall yields plain strings for rules with at most one
                # group and tuples otherwise. Iterating a string visits
                # single characters, so such rules never matched before.
                fragments = (item,) if isinstance(item, str) else item
                for fragment in fragments:
                    if shuxing not in fragment:
                        continue
                    found = attr_pattern.search(fragment)
                    # Skip fragments where the attribute is not double-quoted
                    # instead of failing the whole page with IndexError.
                    if found:
                        img_list.append(found.group(1))
            return img_list
        except Exception as e:
            self.printf(str(e))
            self.printf("解析:" + url + "失败")
            return None

    def gettitle(self, url):
        """Return the first space-delimited token of the page title.

        Returns ``None`` and logs the error when the page cannot be fetched
        or parsed.
        """
        try:
            response = requests.get(url=url, timeout=10, verify=False)
            response.encoding = "GBK"
            soup = BeautifulSoup(response.text, 'html.parser')
            title = soup.find("title").text.split(" ")[0]
            return title
        except Exception as e:
            self.printf(str(e))
            return None

    def myse(self, path, img_url):
        """Download *img_url* and write the bytes to *path*, logging the outcome."""
        try:
            self.printf("开始保存:" + path)
            response = requests.get(img_url, timeout=10, verify=False)
            # Reject HTTP error responses so an error page is not saved as
            # an image.
            response.raise_for_status()
            with open(path, 'wb') as f:
                f.write(response.content)
            self.printf("保存成功:" + path)
        except Exception as e:
            self.printf(str(e))
            self.printf("保存失败:" + path + ":" + img_url)

    def seimag(self, img_url, title, i, path):
        """Save one image unless a non-empty file with its name already exists.

        Args:
            img_url: Address of the image.
            title: Page title, used as the file-name prefix.
            i: Sequence number of the image within the page.
            path: Target directory ending with a path separator; created
                when missing.
        """
        from urllib.parse import urlsplit

        # The extension comes from the URL path only. The original used
        # split(".")[1], which returns a piece of the host name.
        houzhui = os.path.splitext(urlsplit(img_url).path)[1]
        img_name = title + str(i) + houzhui
        # makedirs also creates missing parent directories.
        os.makedirs(path, exist_ok=True)
        path = path + img_name
        if os.path.isfile(path) and os.path.getsize(path) > 0:
            self.printf("已存在:" + path)
            return
        # Either the file is missing or a previous attempt left it empty.
        self.myse(path, img_url)

    def urlrun(self, url, path):
        """Scrape one page: extract the image URLs and save every image.

        Args:
            url: Address of the page.
            path: Target directory; a trailing "/" is appended when missing.
        """
        if not path:
            # The original indexed into the empty string and raised IndexError.
            self.printf("保存地址不能为空")
            return
        if not path.endswith("/"):
            path = path + "/"
        # Fetch and parse once; the original downloaded the page twice.
        img_list_url = self.getHtml(
            url,
            shuxing=self.lineEdit_shuxing.text(),
            guize=self.lineEdit_guize.text(),
        )
        if img_list_url is None:
            return
        img_list_len = len(img_list_url)
        self.printf("图片个数:" + str(img_list_len))
        if img_list_len:
            title = self.gettitle(url)
            if title is None:
                title = "liuyiliux:标题获取失败"
            self.printf("标题为:" + title)
            for i, img_url in enumerate(img_list_url, 1):
                self.seimag(img_url, title, i, path)
        self.printf("结束:")
        self.printf(url)
if __name__ == "__main__":
    # Script entry point: build the application object, show the form on a
    # plain QWidget and exit with the event loop's return code.
    import sys

    application = QtWidgets.QApplication(sys.argv)
    window = QtWidgets.QWidget()
    form = Ui_Form()
    form.setupUi(window)
    window.show()
    exit_code = application.exec_()
    sys.exit(exit_code)
Python新手学习中
目前只试过小草社区
2楼:应该是用的python3吧,建议加上daili
3楼:求打包~~~~~~~~~~~~~~~~~~~~~~~~~
4楼:图图 发表于 2020-2-11 00:51
求打包~~~~~~~~~~~~~~~~~~~~~~~~~
同求 哈哈 该买硬盘了
5楼:换电脑。买硬盘,这次一定要买大的
6楼:>>>
=================== RESTART: /Users/edison/Desktop/count 2.py ==================
Traceback (most recent call last):
File "/Users/edison/Desktop/count 2.py", line 11, in <module>
import requests
ModuleNotFoundError: No module named ‘requests’
>>>
7楼:老哥,运行了出现“q为退出
输入网址数列(,分割):”
然后怎么做,刚学的小白,还有些代码看不懂,求教
8楼:ginloveyou 发表于 2020-2-11 17:18
>>>
=================== RESTART: /Users/edison/Desktop/count 2.py ==================
Traceback (mos …
兄弟,你是‘requests’这个模块没装,而且第七行‘PIL’那个库也要装,不过‘PIL’好像要对应Python的版本,不好装。装‘pillow’库也可以,完美运行
9楼:暖暖有点二 发表于 2020-2-11 17:23
老哥,运行了出现“q为退出
输入网址数列(,分割):”
然后怎么做,刚学的小白,还有些代码看不懂,求教 …
输入小草帖子网址 做了个进程爬取 逗号分割将网址放到一个数组里了
10楼:liuyiliux 发表于 2020-2-11 19:05
输入小草帖子网址 做了个进程爬取 逗号分割将网址放到一个数组里了
输入网址了,储存位置也输入了,还是不对。老哥麻烦你有空出个运行的步骤图吧
11楼: 评论区里大神多。。。果然见识了。。。。我啥都没看懂。。。。
12楼:如果打包成exe,命令行输入 地址 更好。
13楼:暖暖有点二 发表于 2020-2-11 21:23
输入网址了,储存位置也输入了,还是不对。老哥麻烦你有空出个运行的步骤图吧 …
有报错吗 发来看看
打包好的图 https://pan.baidu.com/s/10tU6hh2TfkrF6kTqOLo2Hg 提取码: b4cf
14楼:from PIL import Image
from io import BytesIO
这两个没有用到
con = sqlite3.connect(“/home/test/PycharmProjects/MyPython/picture/Rul.db”)
数据库地址,应该单独提出来
getHtml1 getHtml2 getHtml3
函数名 应该 见名知意
response = requests.get(url=url,timeout=10,verify=False)
response.encoding = “GBK”
soup = BeautifulSoup(response.text, ‘html.parser’)
相同的内容应该封装 成为 函数
img_name = title + “” + str(i) + houzhui
这种复杂的 字符串拼接 请用字符串格式化 f'{title}{i}{houzhui}'
i = 0
for img_url in img_list_url:
i = i + 1
seimag(img_url, title, filename, i, path)
可以用 enumerate 函数(原代码的序号从 1 开始,所以要传 start=1)
for i, img_url in enumerate(img_list_url, 1):
    seimag(img_url, title, filename, i, path)
还有很多问题,需要多多加强
另外不介意的话 分享一下 pyqt 的资料,一只说学一下,一直没时间去搞!
15楼:暖暖有点二 发表于 2020-2-11 21:23
输入网址了,储存位置也输入了,还是不对。老哥麻烦你有空出个运行的步骤图吧 …
16楼:悦~ 发表于 2020-2-11 21:52
from PIL import Image
from io import BytesIO
这两个没有用到
是的本来是用pil方法保存图片时候用的后来发现保存的图片大小要小几kb,考虑到下次判断是否下载重复去掉这个方法了
感谢指导
pyqt5 我按照这个网站例子改的http://code.py40.com/1948.html
17楼:liuyiliux 发表于 2020-2-11 21:59
是的本来是用pil方法保存图片时候用的后来发现保存的图片大小要小几kb,考虑到下次判断是否下载重复去掉 …
好的 感谢!
另外再教你一个 自动 安装库的小技巧
try:
    import requests
except ImportError:
    # Only a missing module triggers the install; other import errors
    # surface unchanged.
    import subprocess
    import sys

    # Run pip through the interpreter executing this script, so the package
    # is installed into the environment that imports it. A bare "pip" on
    # PATH may belong to a different Python.
    subprocess.check_call([sys.executable, "-m", "pip", "install", "requests"])
    import requests
18楼:悦~ 发表于 2020-2-11 22:05
好的 感谢!
另外再教你一个 自动 安装库的小技巧
谢谢学习了
19楼:liuyiliux 发表于 2020-2-11 22:07
谢谢学习了
还想起一个事,学爬虫 找 https://cuiqingcai.com/ 论坛也有他出的 爬虫教程,找找应该可以找到。
20楼:夜不能寐 发表于 2020-2-11 21:32
如果打包成exe,命令行输入 地址 更好。
论坛里面之前有人发过一个通用的 好像叫作品图浏览采集 易语言的
21楼:liuyiliux 发表于 2020-2-11 22:07
谢谢学习了
还想起一个事 pip install -i https://pypi.tuna.tsinghua.edu.cn/simple requests
这样下载更快
22楼:小草是不是用Discuz啊,最近想把别人的贴子爬出来保存为word文档
23楼:能力有限,真的看不懂。
24楼:悦~ 发表于 2020-2-11 21:52
from PIL import Image
from io import BytesIO
这两个没有用到
擦个楼,老哥说的有理。献丑了。
def getPage(url):
    """Download *url*, decode it as GBK and return the parsed BeautifulSoup tree."""
    # The original bound the result to "responese" and then read "response",
    # which raised NameError on the next line.
    response = requests.get(url, timeout=10, verify=False)
    response.encoding = 'GBK'
    soup = BeautifulSoup(response.text, 'html.parser')
    return soup
复制代码
25楼:Jayson 发表于 2020-2-11 22:31
擦个楼,老哥说的有理。献丑了。
欢迎进来讨论!
我一般 命名 api_get 然后 不验证https 这些都作为参数传递进去,但是带默认值!
26楼:liuyiliux 发表于 2020-2-11 21:34
有报错吗 发来看看
打包好的图 https://pan.baidu.com/s/10tU6hh2TfkrF6kTqOLo2Hg 提取码: b4cf …
大神密码是什么啊?
27楼:mmc517 发表于 2020-2-12 01:54
大神密码是什么啊?
密码fulibus
28楼:liuyiliux 发表于 2020-2-11 21:55
谢谢啦,果然评论区都是大佬,还得虚心多学习
29楼:图图 发表于 2020-2-11 00:51
求打包~~~~~~~~~~~~~~~~~~~~~~~~~
打包帖子因为闪现违规被删了。。。。
30楼:所以是哪个帖子
