First Steps with Python | Sharing a Python Project
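This post shares a small practice scraper: it walks the category blocks on the bbiquge.net front page with requests and BeautifulSoup, then stores the categories, books, and chapters into a local MySQL database through pymysql.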
import datetime
import re
import pymysql
import requests
from bs4 import BeautifulSoup
def spider():
    url = "https://www.bbiquge.net/"
    html = requests.get(url)
    html.encoding = 'gbk'  # the site serves GBK-encoded pages
    text = html.text
    bs = BeautifulSoup(text, 'lxml')
    box = bs.select("#mainleft .titletop")  # one block per category
    db = conn()
    query = db.cursor()
    for item in box:
        category = item.select('h3')[0].string
        time = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
        sql = 'insert into category(name,created_at) values (%s,%s)'
        query.execute(sql, (category, time))
        insert_id = db.insert_id()
        handler_top(item, insert_id, query, db)
        li = item.select("ul li")
        del li[:1]  # the first li is the featured book, already handled by handler_top
        for i in li:
            book_id, link = handler_li(i, insert_id, query, db)
            handler_chapter(book_id, link, query, db)
def handler_top(content, insert_id, query, db):
    print("----------- scraping the featured book --------")
    top = content.select("ul li")[0]
    title = top.select(".text strong a")
    name = title[0].string
    link = title[0]['href']
    author_str = top.select(".text p")
    category_id = insert_id
    # The author name sits between the literal "作者:" ("Author:")
    # and the next HTML tag inside the <p> element's markup.
    pattern = re.compile("(?<=作者:).*?(?=<)")
    s = str(author_str[0])
    m = pattern.search(s)
    author = m.group()
    book_sql = 'insert into books(name,author,link,category_id) values (%s,%s,%s,%s)'
    query.execute(book_sql, (name, author, link, category_id))
    book_id = db.insert_id()
    handler_chapter(book_id, link, query, db)
def handler_li(content, insert_id, query, db):
    print("----------- scraping a book title --------")
    name = content.select("a")[0].string
    link = content.select("a")[0]['href']
    category_id = insert_id
    # select() takes a CSS selector, not a class_ keyword argument
    author = content.select("span.author")[0].string
    book_sql = 'insert into books(name,author,link,category_id) values (%s,%s,%s,%s)'
    query.execute(book_sql, (name, author, link, category_id))
    book_id = db.insert_id()
    return book_id, link
def handler_chapter(book_id, link, query, db):
    print("----------- scraping chapter list -------- " + link)
    page_html = requests.get(link)
    page_text = page_html.text
    bs = BeautifulSoup(page_text, 'lxml')
    # The chapter index is paginated; the <select class="form-control">
    # dropdown holds one <option> per index page.
    pages = bs.find("select", "form-control").find_all("option")
    for page in range(1, len(pages) + 1):  # index pages are 1-based
        url = link + "index_" + str(page) + ".html"
        print("----------- scraping index page -------- " + url)
        chapter_html = requests.get(url)
        chapter_text = chapter_html.text
        bs = BeautifulSoup(chapter_text, 'lxml')
        dd = bs.select("dl dd")
        for d in dd:
            href = d.select("a")[0]["href"]
            url = link + href
            print("----------- scraping chapter content -------- " + url)
            headers = {
                'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) '
                              'Chrome/112.0.0.0 Safari/537.36',
                'x-client-data': "ckk1yqeiilbjaqiitskbcmg2yqeiqz3kaqj5k8sbcjshywei/krmaqic/swbciwgzqeivqlnaq=="
            }
            content_html = requests.get(url, headers=headers)
            content_html.encoding = 'gbk'
            content_text = content_html.text
            bs = BeautifulSoup(content_text, 'lxml')
            article = bs.find("div", id="content").text
            name = bs.find("h1").text
            page_size = page
            old_chapter = href.split(".", 1)[0]  # chapter number without the .html suffix
            lk = url
            created_at = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
            bid = book_id
            content_sql = 'insert into chapter(name,link,old_chapter,content,page,created_at,book_id)' \
                          ' values (%s,%s,%s,%s,%s,%s,%s)'
            query.execute(content_sql, (name, lk, old_chapter, article, page_size, created_at, bid))
            db.commit()
            print("----------- one chapter saved ------------")
def conn():
    try:
        db = pymysql.connect(
            host='127.0.0.1',
            port=3306,
            user='root',
            passwd='root',
            db='stories',
            charset='utf8'
        )
        return db
    except Exception as b:
        print(b.args)
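The INSERT statements above assume three tables already exist in the stories database. For reference, here is a minimal schema sketch that matches the columns the script writes; the column types and sizes are my assumptions, not from the original code:

# A minimal schema sketch matching the INSERT statements above.
# Column types/sizes are assumptions; adjust to your data.
import pymysql

SCHEMA = """
create table if not exists category (
    id int auto_increment primary key,
    name varchar(64),
    created_at datetime
);
create table if not exists books (
    id int auto_increment primary key,
    name varchar(255),
    author varchar(64),
    link varchar(255),
    category_id int
);
create table if not exists chapter (
    id int auto_increment primary key,
    name varchar(255),
    link varchar(255),
    old_chapter varchar(32),
    content longtext,
    page int,
    created_at datetime,
    book_id int
);
"""

def create_tables():
    db = pymysql.connect(host='127.0.0.1', port=3306, user='root',
                         passwd='root', db='stories', charset='utf8')
    with db.cursor() as cur:
        for stmt in SCHEMA.split(';'):
            if stmt.strip():
                cur.execute(stmt)
    db.commit()
    db.close()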
if __name__ == '__main__':
    try:
        spider()
    except Exception as e:
        print(e.args)
Next step: learn to use multithreading for scraping; after that, build a desktop scraping tool.
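As a starting point for the multithreading step, here is a minimal sketch using concurrent.futures. It assumes the handler_chapter and conn functions above, and gives each worker its own database connection, since a single pymysql connection must not be shared across threads:

# A minimal multithreaded sketch, assuming handler_chapter() and
# conn() defined above. Each worker opens its own connection
# because pymysql connections are not thread-safe.
from concurrent.futures import ThreadPoolExecutor

def crawl_book(book_id, link):
    db = conn()               # one connection per task
    query = db.cursor()
    try:
        handler_chapter(book_id, link, query, db)
    finally:
        db.close()

def crawl_all(books):
    # books: list of (book_id, link) pairs gathered by spider()
    with ThreadPoolExecutor(max_workers=4) as pool:
        futures = [pool.submit(crawl_book, bid, lk) for bid, lk in books]
        for f in futures:
            f.result()        # re-raise any worker exception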