爬虫小程序分享爬取小草单个帖子

#coding=utf8

import math

import os

from threading import Thread

import sqlite3

import requests

from PIL import Image

from io import BytesIO

from bs4 import BeautifulSoup

def

爬虫小程序分享爬取小草单个帖子

#coding=utf8

import math

import os

from threading import Thread

import sqlite3

import requests

from PIL import Image

from io import BytesIO

from bs4 import BeautifulSoup

def getHtml(url):
    """Collect image URLs from ``<img>`` tags inside the ``.tpc_content`` element.

    The page is fetched with TLS verification disabled and decoded as GBK.
    This is the first of four extraction strategies tried by ``urlrun``.

    Args:
        url: Address of the page to parse.

    Returns:
        A list of the non-empty ``datasrc`` attribute values, in document
        order. An empty list is returned (and a failure line printed) when
        the request fails or the page has no ``.tpc_content`` element.
    """
    try:
        response = requests.get(url=url, timeout=10, verify=False)
    except requests.RequestException:
        print("解析:" + url + "失败")
        return []

    response.encoding = "GBK"
    soup = BeautifulSoup(response.text, "html.parser")

    content = soup.select_one(".tpc_content")
    if content is None:
        # No container on this page: report it the same way as a failed fetch.
        print("解析:" + url + "失败")
        return []

    # NOTE(review): the attribute is spelled 'datasrc' here; it may have been
    # 'data-src' before the listing lost its hyphens -- confirm against the
    # target markup.
    candidates = (img.get("datasrc") for img in content.find_all("img"))
    # Tags without the attribute yield None; drop them so callers only ever
    # see real URLs.
    return [src for src in candidates if src]

def gettitle(url):
    """Return the first space-separated token of the page's ``<title>`` text.

    The page is fetched with TLS verification disabled and decoded as GBK.

    Args:
        url: Address of the page.

    Returns:
        The text of ``<title>`` up to (not including) the first space.

    Raises:
        requests.RequestException: If the page cannot be fetched.
        ValueError: If the page has no ``<title>`` element.
    """
    response = requests.get(url=url, timeout=10, verify=False)
    response.encoding = "GBK"
    soup = BeautifulSoup(response.text, "html.parser")

    title_tag = soup.find("title")
    if title_tag is None:
        # Fail with a message that names the page instead of an opaque
        # "'NoneType' object has no attribute 'text'".
        raise ValueError("no <title> element found in " + url)

    # NOTE(review): the separator is a single space here; it may originally
    # have been " - " -- confirm against real page titles.
    return title_tag.text.split(" ")[0]

def getHtml2(url):
    """Collect image URLs from ``<input type="image">`` tags inside ``.tpc_content``.

    Second extraction strategy tried by ``urlrun``. The page is fetched with
    TLS verification disabled and decoded as GBK.

    Args:
        url: Address of the page to parse.

    Returns:
        A list of the non-empty ``datasrc`` attribute values, in document
        order. An empty list is returned (and a failure line printed) when
        the request fails or the page has no ``.tpc_content`` element.
    """
    try:
        response = requests.get(url=url, timeout=10, verify=False)
    except requests.RequestException:
        print("解析:" + url + "失败")
        return []

    response.encoding = "GBK"
    soup = BeautifulSoup(response.text, "html.parser")

    content = soup.select_one(".tpc_content")
    if content is None:
        print("解析:" + url + "失败")
        return []

    image_inputs = content.find_all("input", {"type": "image"})
    # NOTE(review): 'datasrc' may have been 'data-src' before the listing lost
    # its hyphens -- confirm against the target markup.
    candidates = (tag.get("datasrc") for tag in image_inputs)
    # Skip tags that do not carry the attribute (``get`` returns None).
    return [src for src in candidates if src]

def getHtml3(url):
    """Collect image URLs from every ``<input type="image">`` tag on the page.

    Third extraction strategy tried by ``urlrun``; unlike ``getHtml2`` it
    searches the whole document. When more than two tags are found the first
    one is discarded before the URLs are read.

    Args:
        url: Address of the page to parse.

    Returns:
        A list of the non-empty ``datasrc`` attribute values, in document
        order; an empty list (with a failure line printed) if the request
        fails.
    """
    try:
        response = requests.get(url=url, timeout=10, verify=False)
    except requests.RequestException:
        print("解析:" + url + "失败")
        return []

    response.encoding = "GBK"
    soup = BeautifulSoup(response.text, "html.parser")

    image_inputs = list(soup.find_all("input", {"type": "image"}))
    if len(image_inputs) > 2:
        # Presumably the first match is not part of the post (e.g. a site
        # button) -- the original code dropped it unconditionally.
        image_inputs = image_inputs[1:]

    # NOTE(review): 'datasrc' may have been 'data-src' before the listing lost
    # its hyphens -- confirm against the target markup.
    candidates = (tag.get("datasrc") for tag in image_inputs)
    return [src for src in candidates if src]

def getHtml4(url):
    """Collect image URLs from every ``<img>`` tag on the page.

    Fourth and last extraction strategy tried by ``urlrun``. Tags without a
    ``datasrc`` attribute are ignored; when more than two URLs remain the
    first one is discarded.

    Args:
        url: Address of the page to parse.

    Returns:
        A list of ``datasrc`` attribute values in document order; an empty
        list (with a failure line printed) if the request fails.
    """
    try:
        response = requests.get(url=url, timeout=10, verify=False)
    except requests.RequestException:
        print("解析:" + url + "失败")
        return []

    response.encoding = "GBK"
    soup = BeautifulSoup(response.text, "html.parser")

    # NOTE(review): 'datasrc' may have been 'data-src' before the listing lost
    # its hyphens -- confirm against the target markup.
    sources = [img.get("datasrc") for img in soup.find_all("img")]
    list_img_url = [src for src in sources if src is not None]

    if len(list_img_url) > 2:
        # Presumably the first image is site chrome rather than post content;
        # the original code dropped it unconditionally.
        del list_img_url[0]
    return list_img_url

def myse(path, img_url):
    """Download ``img_url`` and write the raw response bytes to ``path``.

    Progress and failures are reported on stdout; nothing is raised to the
    caller, so one bad image never interrupts a batch.

    Args:
        path: Destination file path (its directory must already exist).
        img_url: Address of the image to download.
    """
    print("开始保存:" + path)
    try:
        response = requests.get(img_url, timeout=10, verify=False)
        # Without this an HTTP error page (404/403 HTML) would be written to
        # disk as if it were the image.
        response.raise_for_status()
        # The raw bytes are written unchanged so a later size check on the
        # file reflects exactly what the server sent.
        with open(path, "wb") as f:
            f.write(response.content)
    except (requests.RequestException, OSError) as e:
        print("保存失败:" + path + ":" + img_url)
        print(e)
        return
    print("保存成功:" + path)

def seimag(img_url, title, filename, i, path):
    """Record one image in the database and download it unless already saved.

    The file name is ``title`` + ``i`` + the URL's file extension. A file
    that already exists with non-zero size is left alone; a missing or
    zero-byte file is (re)downloaded.

    Args:
        img_url: Address of the image.
        title: Post title, used as the file-name prefix.
        filename: Unused. Both branches of the original built the same name
            regardless of this value; kept for call compatibility.
        i: 1-based index of the image within the post.
        path: Destination directory, expected to end with ``/``.
    """
    # Take the extension from the last path segment. The original used
    # ``img_url.split(".")[1]``, which for "https://a.b.com/x.jpg" yields
    # ".b" instead of ".jpg".
    url_path = img_url.split("?", 1)[0].split("#", 1)[0]
    last_segment = url_path.rsplit("/", 1)[-1]
    # Fall back to ".jpg" when the URL carries no extension at all.
    houzhui = os.path.splitext(last_segment)[1] or ".jpg"

    # NOTE(review): the original joined title and index with an empty string
    # (``title + "" + str(i)``); a "-" separator may have been lost when the
    # listing was extracted -- confirm before changing the naming scheme.
    img_name = title + "" + str(i) + houzhui
    target = path + img_name

    sqliteDb(img_name, title, img_url)

    if os.path.isfile(target) and os.path.getsize(target) > 0:
        print("已存在:" + target)
    else:
        myse(target, img_url)

def urlrun(url, filename, path):
    """Download every image of one post.

    The four extraction strategies are tried in order until one returns more
    than two image URLs. If none does, the URL is reported as problematic and
    whatever the last strategy returned is still downloaded.

    Args:
        url: Address of the post.
        filename: Ignored (the original overwrote it with ``""``); kept for
            call compatibility.
        path: Destination directory, expected to end with ``/``.
    """
    strategies = (
        ("第一种方法图片个数:", getHtml),
        ("第二种方法图片个数:", getHtml2),
        ("第三种方法图片个数:", getHtml3),
        ("第四种方法图片个数:", getHtml4),
    )

    img_list_url = []
    for label, extract in strategies:
        # Each strategy fetches the page once; the original fetched it twice
        # for the first strategy (once only to test ``is not None``).
        img_list_url = extract(url)
        print(label + str(len(img_list_url)))
        if len(img_list_url) > 2:
            break
    else:
        # No strategy produced more than two images.
        print(url)
        print("有问题")

    try:
        title = gettitle(url)
    except (requests.RequestException, AttributeError, ValueError) as exc:
        # Without a title there is no file-name prefix; give up on this post
        # instead of letting the exception end the worker thread.
        print("解析:" + url + "失败")
        print(exc)
        return
    print("标题为:" + title)

    for i, img_url in enumerate(img_list_url, 1):
        if not img_url:
            # Extractors may yield None for tags lacking the attribute.
            continue
        seimag(img_url, title, "", i, path)

    print("结束:")
    print(url)

def threadrun(list_url2, path):
    """Thread target: process a batch of post URLs one after another.

    Args:
        list_url2: URLs handled by this worker.
        path: Destination directory passed through to ``urlrun``.
    """
    for url in list_url2:
        try:
            urlrun(url, None, path)
        except Exception as exc:
            # Thread boundary: report the failure and continue, so one bad
            # URL does not silently drop the rest of the batch.
            print("解析:" + url + "失败")
            print(exc)

def sqliteDb(name, title, url,
             db_path="/home/test/PycharmProjects/MyPython/picture/Rul.db"):
    """Insert or update one image record in the ``caoliu`` table.

    If a row with ``img_name == name`` exists, only its ``img_url`` is
    updated; otherwise a new row is inserted. The table is created on first
    use. Database errors are printed, not raised, so a bookkeeping failure
    never blocks a download.

    Args:
        name: Image file name (lookup key).
        title: Post title stored with a new row.
        url: Image address.
        db_path: SQLite database file. Defaults to the author's original
            location; pass your own path instead of editing the code.
    """
    con = None
    try:
        con = sqlite3.connect(db_path)
        # ``with con`` commits on success and rolls back on error.
        with con:
            con.execute(
                "CREATE TABLE IF NOT EXISTS caoliu("
                "img_name TEXT PRIMARY KEY,title TEXT,img_url TEXT)"
            )
            row = con.execute(
                "select * from caoliu where img_name=?", (name,)
            ).fetchone()
            if row is not None:
                con.execute(
                    "UPDATE caoliu SET img_url=? WHERE img_name=?",
                    (url, name),
                )
            else:
                # Placeholders instead of string-built SQL: titles and URLs
                # containing quotes previously broke the statement and
                # allowed SQL injection.
                con.execute(
                    "INSERT INTO caoliu (img_name,title,img_url) "
                    "VALUES (?,?,?)",
                    (name, title, url),
                )
    except sqlite3.Error as exc:
        print("sqliteDb:", exc)
    finally:
        if con is not None:
            con.close()

if __name__ == '__main__':
    # Interactive loop: read a comma-separated list of post URLs and a target
    # directory, then download the posts with a small pool of threads.
    requests.packages.urllib3.disable_warnings()
    print("q为退出")

    while True:
        list_url_str = input("输入网址数列(,分割):")
        if list_url_str == "q":
            raise SystemExit(0)
        # Ignore stray whitespace and empty entries (e.g. a trailing comma).
        list_url = [u.strip() for u in list_url_str.split(",") if u.strip()]

        path = input("保存地址:")
        if path == "q":
            raise SystemExit(0)
        path = path.replace(" ", "")
        if path == "":
            path = "/home/test/Pictures/picture/"  # default save location
        # The original tested ``"/" is not path[1]``: an identity comparison
        # on the second character. The intent is "ends with a slash".
        if not path.endswith("/"):
            path = path + "/"
        try:
            os.makedirs(path, exist_ok=True)
        except OSError as exc:
            print(exc)
            continue

        # 10 workers for 20 or more URLs, otherwise 4 (as in the original).
        worker_count = 10 if len(list_url) >= 20 else 4
        size = math.ceil(len(list_url) / worker_count)

        threads = []
        for i in range(worker_count):
            batch = list_url[i * size:(i + 1) * size]
            if not batch:
                # Fewer URLs than workers: nothing to start.
                continue
            t = Thread(target=threadrun, args=(batch, path))
            t.start()
            threads.append(t)

        # Join only after every worker has started; joining right after
        # ``start()`` would run the batches one at a time.
        for t in threads:
            t.join()

        print("全部结束")

注意:代码中的 /home/test/PycharmProjects/MyPython/picture/Rul.db 是我的数据库地址,请修改为自己的地址;该数据库仅用于保存图片链接。

二:pyqt5版本

# -*- coding: utf-8 -*-

# Form implementation generated from reading ui file ‘爬取pyqt测试.ui’

#

# Created by: PyQt5 UI code generator 5.13.2

#

# WARNING! All changes made in this file will be lost!

import os

import re

import requests

from PyQt5 import QtCore, QtGui, QtWidgets

from bs4 import BeautifulSoup

class Ui_Form(object):
    """PyQt5 form that downloads the images of a single page.

    The user supplies a page URL, a regular expression ("解析规则") that
    selects HTML fragments, the attribute holding the image address
    ("图片属性") and a destination directory. Progress is written to the
    text browser at the bottom of the form.
    """

    def setupUi(self, Form):
        """Create and lay out all widgets on ``Form`` and wire the button."""
        Form.setObjectName("Form")
        Form.resize(738, 611)

        self.paquButton = QtWidgets.QPushButton(Form)
        self.paquButton.setGeometry(QtCore.QRect(600, 230, 91, 31))
        self.paquButton.setObjectName("paquButton")

        self.label = QtWidgets.QLabel(Form)
        self.label.setGeometry(QtCore.QRect(110, 70, 68, 23))
        font = QtGui.QFont()
        font.setPointSize(14)
        self.label.setFont(font)
        self.label.setObjectName("label")

        self.lineEdit_url = QtWidgets.QLineEdit(Form)
        self.lineEdit_url.setGeometry(QtCore.QRect(190, 70, 381, 31))
        self.lineEdit_url.setObjectName("lineEdit_url")

        self.lineEdit_path = QtWidgets.QLineEdit(Form)
        self.lineEdit_path.setGeometry(QtCore.QRect(210, 230, 311, 31))
        self.lineEdit_path.setObjectName("lineEdit_path")

        self.label_2 = QtWidgets.QLabel(Form)
        self.label_2.setGeometry(QtCore.QRect(110, 230, 101, 23))
        font = QtGui.QFont()
        font.setPointSize(14)
        self.label_2.setFont(font)
        self.label_2.setObjectName("label_2")

        self.textBrowser_print = QtWidgets.QTextBrowser(Form)
        self.textBrowser_print.setGeometry(QtCore.QRect(110, 290, 581, 241))
        self.textBrowser_print.setObjectName("textBrowser_print")

        self.lineEdit_guize = QtWidgets.QLineEdit(Form)
        self.lineEdit_guize.setGeometry(QtCore.QRect(210, 150, 311, 31))
        self.lineEdit_guize.setObjectName("lineEdit_guize")

        self.label_3 = QtWidgets.QLabel(Form)
        self.label_3.setGeometry(QtCore.QRect(110, 150, 91, 23))
        font = QtGui.QFont()
        font.setPointSize(14)
        self.label_3.setFont(font)
        self.label_3.setObjectName("label_3")

        self.comboBox_yemian = QtWidgets.QComboBox(Form)
        self.comboBox_yemian.setGeometry(QtCore.QRect(610, 70, 100, 31))
        self.comboBox_yemian.setObjectName("comboBox_yemian")
        # Two placeholder items; their texts are set in retranslateUi.
        self.comboBox_yemian.addItem("")
        self.comboBox_yemian.addItem("")

        self.label_4 = QtWidgets.QLabel(Form)
        self.label_4.setGeometry(QtCore.QRect(540, 150, 91, 23))
        # Reuses the 14pt font object created for label_3.
        self.label_4.setFont(font)
        self.label_4.setObjectName("label_4")

        self.lineEdit_shuxing = QtWidgets.QLineEdit(Form)
        self.lineEdit_shuxing.setGeometry(QtCore.QRect(630, 150, 61, 31))
        self.lineEdit_shuxing.setObjectName("lineEdit_shuxing")

        self.retranslateUi(Form)
        self.paquButton.clicked.connect(self.paqu)
        QtCore.QMetaObject.connectSlotsByName(Form)

    def retranslateUi(self, Form):
        """Set all user-visible texts and the default image attribute."""
        _translate = QtCore.QCoreApplication.translate
        Form.setWindowTitle(_translate("Form", "图片爬取 by怪盗LYL"))
        self.paquButton.setText(_translate("Form", "爬取"))
        self.label.setText(_translate("Form", "网址:"))
        self.label_2.setText(_translate("Form", "保存地址:"))
        self.label_3.setText(_translate("Form", "解析规则:"))
        self.comboBox_yemian.setItemText(0, _translate("Form", "单个页面爬取"))
        self.comboBox_yemian.setItemText(1, _translate("Form", "需要翻页"))
        self.label_4.setText(_translate("Form", "图片属性:"))
        # NOTE(review): 'datasrc' may have been 'data-src' before the listing
        # lost its hyphens -- confirm against the target markup.
        self.lineEdit_shuxing.setText(_translate("Form", 'datasrc'))

    def paqu(self):
        """Slot for the button: read the form and start a single-page run."""
        requests.packages.urllib3.disable_warnings()
        path = self.lineEdit_path.text()
        url = self.lineEdit_url.text()
        yemian = str(self.comboBox_yemian.currentIndex())
        # Only index 0 (single page) is implemented; selecting the paging
        # entry currently does nothing.
        if "0" == yemian:
            self.urlrun(url=url, path=path)

    def printf(self, mypstr):
        """Append ``mypstr`` to the log area and keep the UI responsive."""
        # Show the message in the log area.
        self.textBrowser_print.append(mypstr)
        self.cursor = self.textBrowser_print.textCursor()
        # Move the cursor to the end so the newest line scrolls into view.
        self.textBrowser_print.moveCursor(QtGui.QTextCursor.End)
        # Process pending events; without this the window freezes while the
        # download runs on the GUI thread.
        QtWidgets.QApplication.processEvents()

    def getHtml(self, url, shuxing, guize):
        """Extract image URLs from ``url`` using a user-supplied regex.

        Args:
            url: Page address.
            shuxing: Name of the attribute that holds the image address.
            guize: Regular expression applied (with ``re.S``) to the page's
                stringified contents; each match, or each group of a match,
                is scanned for ``shuxing="..."``.

        Returns:
            A list with the first attribute value found in each fragment, or
            ``None`` if fetching or parsing failed (the error is logged).
        """
        try:
            response = requests.get(url=url, timeout=10, verify=False)
            response.encoding = "GBK"
            soup = BeautifulSoup(response.text, 'html.parser')

            pattern = re.compile(guize, re.S)
            item_list = pattern.findall(str(soup.contents))

            # Compile once; the attribute name is matched literally.
            attr_pattern = re.compile(re.escape(shuxing) + '="(.*?)"', re.S)

            img_list = []
            for item in item_list:
                # ``findall`` yields plain strings for patterns with at most
                # one group and tuples otherwise. Iterating a string would
                # walk it character by character and never match.
                fragments = (item,) if isinstance(item, str) else item
                for fragment in fragments:
                    if shuxing not in fragment:
                        continue
                    found = attr_pattern.findall(fragment)
                    if found:
                        img_list.append(found[0])
            return img_list
        except Exception as e:
            # GUI boundary: covers network errors and invalid user regexes.
            self.printf(e.__str__())
            self.printf("解析:" + url + "失败")
            return None

    def gettitle(self, url):
        """Return the first space-separated token of the page title.

        Returns ``None`` (after logging the error) if the page cannot be
        fetched or has no ``<title>``.
        """
        try:
            response = requests.get(url=url, timeout=10, verify=False)
            response.encoding = "GBK"
            soup = BeautifulSoup(response.text, 'html.parser')
            title = soup.find("title").text.split(" ")[0]
            return title
        except Exception as e:
            self.printf(e.__str__())
            return None

    def myse(self, path, img_url):
        """Download ``img_url`` to ``path``; log success or failure."""
        try:
            self.printf("开始保存:" + path)
            response = requests.get(img_url, timeout=10, verify=False)
            # Do not store HTTP error pages as image files.
            response.raise_for_status()
            with open(path, 'wb') as f:
                f.write(response.content)
            self.printf("保存成功:" + path)
        except Exception as e:
            self.printf(e.__str__())
            self.printf("保存失败:" + path + ":" + img_url)

    @staticmethod
    def _image_suffix(img_url):
        """Return the file extension of ``img_url`` (``.jpg`` if it has none)."""
        # Drop query string and fragment, then look at the last path segment
        # only, so dots in the host name are not mistaken for an extension.
        url_path = img_url.split("?", 1)[0].split("#", 1)[0]
        last_segment = url_path.rsplit("/", 1)[-1]
        return os.path.splitext(last_segment)[1] or ".jpg"

    def seimag(self, img_url, title, i, path):
        """Save image number ``i`` of a post unless it is already on disk.

        Args:
            img_url: Image address.
            title: Post title, used as file-name prefix.
            i: 1-based index of the image.
            path: Destination directory ending with ``/``; created if
                missing.
        """
        # The original used ``img_url.split(".")[1]``, which returns part of
        # the host name for ordinary URLs.
        houzhui = self._image_suffix(img_url)
        # NOTE(review): title and index were joined with an empty string in
        # the original; a "-" separator may have been lost in extraction.
        img_name = title + "" + str(i) + houzhui

        if not os.path.isdir(path):
            try:
                # makedirs also creates missing parent directories.
                os.makedirs(path)
            except OSError as e:
                self.printf(e.__str__())
                self.printf("保存失败:" + path + ":" + img_url)
                return

        path = path + img_name
        if os.path.isfile(path) and os.path.getsize(path) > 0:
            self.printf("已存在:" + path)
        else:
            self.myse(path, img_url)

    def urlrun(self, url, path):
        """Download all images found on ``url`` into ``path``."""
        # Empty input falls back to the current directory. The original
        # tested ``"/" is not path[1]`` (identity comparison on the second
        # character), which raised IndexError for short paths.
        path = path.strip() or "."
        if not path.endswith("/"):
            path = path + "/"

        # Fetch and parse once; the original requested the page twice.
        img_list_url = self.getHtml(
            url,
            shuxing=self.lineEdit_shuxing.text(),
            guize=self.lineEdit_guize.text(),
        )
        if img_list_url is None:
            return

        img_list_len = len(img_list_url)
        self.printf("图片个数:" + str(img_list_len))

        if img_list_len != 0:
            title = self.gettitle(url)
            if title is None:
                title = "liuyiliux:标题获取失败"
            self.printf("标题为:" + title)

            for i, img_url in enumerate(img_list_url, 1):
                self.seimag(img_url, title, i, path)

        self.printf("结束:")
        self.printf(url)

if __name__ == "__main__":
    # Script entry point: build the form on a plain QWidget and run the Qt
    # event loop until the window is closed.
    import sys

    app = QtWidgets.QApplication(sys.argv)
    widget = QtWidgets.QWidget()
    ui = Ui_Form()
    ui.setupUi(widget)
    widget.show()
    sys.exit(app.exec_())

复制代码Python新手学习中

目前只试过小草社区

2楼:应该是用的python3吧,建议加上daili

3楼:求打包~~~~~~~~~~~~~~~~~~~~~~~~~

4楼:图图 发表于 2020211 00:51

求打包~~~~~~~~~~~~~~~~~~~~~~~~~

同求 哈哈 该买硬盘了

5楼:换电脑。买硬盘,这次一定要买大的

6楼:>>>

=================== RESTART: /Users/edison/Desktop/count 2.py ==================

Traceback (most recent call last):

File “/Users/edison/Desktop/count 2.py”, line 11, in

import requests

ModuleNotFoundError: No module named ‘requests’

>>>

7楼:老哥,运行了出现“q为退出

输入网址数列(,分割):”

然后怎么做,刚学的小白,还有些代码看不懂,求教

8楼:ginloveyou 发表于 2020211 17:18

>>>

=================== RESTART: /Users/edison/Desktop/count 2.py ==================

Traceback (mos …

兄弟,你是‘requests’这个模块没装,而且第七行‘PIL’那个库也要装,不过‘PIL’好像要对应Python的版本,不好装。装‘pillow’库也可以,完美运行

9楼:暖暖有点二 发表于 2020211 17:23

老哥,运行了出现“q为退出

输入网址数列(,分割):”

然后怎么做,刚学的小白,还有些代码看不懂,求教 …

输入小草帖子网址 做了个进程爬取 逗号分割将网址放到一个数组里了

10楼:liuyiliux 发表于 2020211 19:05

输入小草帖子网址 做了个进程爬取 逗号分割将网址放到一个数组里了

输入网址了,储存位置也输入了,还是不对。老哥麻烦你有空出个运行的步骤图吧

11楼: 评论区里大神多。。。果然见识了。。。。我啥都没看懂。。。。

12楼:如果打包成exe,命令行输入 地址 更好。

13楼:暖暖有点二 发表于 2020211 21:23

输入网址了,储存位置也输入了,还是不对。老哥麻烦你有空出个运行的步骤图吧 …

有报错吗 发来看看

打包好的图 https://pan.baidu.com/s/10tU6hh2TfkrF6kTqOLo2Hg 提取码: b4cf

14楼:from PIL import Image

from io import BytesIO

这两个没有用到

con = sqlite3.connect(“/home/test/PycharmProjects/MyPython/picture/Rul.db”)

数据库地址,应该单独提出来

getHtml1 getHtml2 getHtml3

函数名 应该 见名知意

response = requests.get(url=url,timeout=10,verify=False)

response.encoding = “GBK”

soup = BeautifulSoup(response.text, ‘html.parser’)

相同的内容应该封装 成为 函数

img_name = title + “” + str(i) + houzhui

这种复杂的字符串拼接,请用字符串格式化:f'{title}{i}{houzhui}'

i = 0

for img_url in img_list_url:

i = i + 1

seimag(img_url, title, filename, i, path)

可以用 enumerate 函数

for i, img_url in enumerate(img_list_url, 1):

seimag(img_url, title, filename, i, path)

还有很多问题,需要多多加强

另外不介意的话 分享一下 pyqt 的资料,一只说学一下,一直没时间去搞!

15楼:暖暖有点二 发表于 2020211 21:23

输入网址了,储存位置也输入了,还是不对。老哥麻烦你有空出个运行的步骤图吧 …

16楼:悦~ 发表于 2020211 21:52

from PIL import Image

from io import BytesIO

这两个没有用到

是的本来是用pil方法保存图片时候用的后来发现保存的图片大小要小几kb,考虑到下次判断是否下载重复去掉这个方法了

感谢指导

pyqt5 我按照这个网站例子改的http://code.py40.com/1948.html

17楼:liuyiliux 发表于 2020211 21:59

是的本来是用pil方法保存图片时候用的后来发现保存的图片大小要小几kb,考虑到下次判断是否下载重复去掉 …

好的 感谢!

另外再教你一个 自动 安装库的小技巧

try:
    import requests
except ImportError:
    # Install with the interpreter that is running this script, so the
    # package lands in the same environment, then retry the import.
    import subprocess
    import sys

    subprocess.check_call([sys.executable, "-m", "pip", "install", "requests"])
    import requests

18楼:悦~ 发表于 2020211 22:05

好的 感谢!

另外再教你一个 自动 安装库的小技巧

谢谢学习了

19楼:liuyiliux 发表于 2020211 22:07

谢谢学习了

还想起一个事,学爬虫 找 https://cuiqingcai.com/ 论坛也有他出的 爬虫教程,找找应该可以找到。

20楼:夜不能寐 发表于 2020211 21:32

如果打包成exe,命令行输入 地址 更好。

论坛里面之前有人发过一个通用的 好像叫作品图浏览采集 易语言的

21楼:liuyiliux 发表于 2020211 22:07

谢谢学习了

还想起一个事:pip install -i https://pypi.tuna.tsinghua.edu.cn/simple requests

这样下载更快

22楼:小草是不是用Discuz啊,最近想把别人的贴子爬出来保存为word文档

23楼:能力有限,真的看不懂。

24楼:悦~ 发表于 2020211 21:52

from PIL import Image

from io import BytesIO

这两个没有用到

擦个楼,老哥说的有理。献丑了。

def getPage(url):
    """Fetch ``url`` (GBK-decoded, TLS verification off) and return its parsed soup.

    Args:
        url: Address of the page.

    Returns:
        A ``BeautifulSoup`` object built with the ``html.parser`` backend.

    Raises:
        requests.RequestException: If the page cannot be fetched.
    """
    # The original assigned to ``responese`` and then read ``response``,
    # which raised NameError on the very next line.
    response = requests.get(url, timeout=10, verify=False)
    response.encoding = 'GBK'
    soup = BeautifulSoup(response.text, 'html.parser')
    return soup

复制代码

25楼:Jayson 发表于 2020211 22:31

擦个楼,老哥说的有理。献丑了。

欢迎进来讨论!

我一般 命名 api_get 然后 不验证https 这些都作为参数传递进去,但是带默认值!

26楼:liuyiliux 发表于 2020211 21:34

有报错吗 发来看看

打包好的图 https://pan.baidu.com/s/10tU6hh2TfkrF6kTqOLo2Hg 提取码: b4cf …

大神密码是什么啊?

27楼:mmc517 发表于 2020212 01:54

大神密码是什么啊?

密码fulibus

28楼:liuyiliux 发表于 2020211 21:55

谢谢啦,果然评论区都是大佬,还得虚心多学习

29楼:图图 发表于 2020211 00:51

求打包~~~~~~~~~~~~~~~~~~~~~~~~~

打包帖子因为闪现违规被删了。。。。

30楼:所以是哪个帖子

(0)

相关推荐