Scrapy: quickly crawling an entire site
Spider file
import scrapy
import time
import parsel
import pymysql
from selenium import webdriver
from selenium.webdriver.chrome.service import Service  # Chrome is driven below via chromedriver.exe
from ..items import BookShopItem
class GetBookSpider(scrapy.Spider):
    name = "get_book"
    allowed_domains = ["bqgui.cc"]
    start_urls = ["https://bqgui.cc"]

    def open_spider(self):
        # Open a MySQL connection; fill in your own host, user and password.
        conn = pymysql.connect(
            host='your-database-host',
            user='',
            password='',
            db='book_shop',
            charset='utf8mb4'
        )
        return conn
    def parse(self, response):
        # Categories to crawl; the full list stays commented out while testing.
        # book_index = ['xuanhuan', 'wuxia', 'dushi', 'lishi', 'wangyou', 'kehuan', 'mm']
        book_index = ['xuanhuan']
        service = Service(executable_path='chromedriver.exe')
        driver = webdriver.Chrome(service=service)
        conn = self.open_spider()  # the actual inserts happen in the pipeline
        cursor = conn.cursor()
        for i in range(len(book_index)):
            url = f'https://bqgui.cc/{book_index[i]}'
            driver.get(url=url)
            driver.implicitly_wait(10)
            driver.minimize_window()
            # self.drop_drown(driver)  # scroll down if the listing is lazy-loaded
            response2_text = driver.page_source
            selector = parsel.Selector(response2_text)
            # Collect the relative links to every book on the category page.
            book_name_href = selector.xpath('//div[@class="item"]/dl/dt/a/@href').getall()
            yield BookShopItem(book_name_href=book_name_href)
        cursor.close()
        conn.close()
        driver.quit()
    def drop_drown(self, driver):
        # Scroll the page step by step so lazy-loaded entries get rendered.
        start = 0
        for i in range(21):
            js = f'document.documentElement.scrollTop={start}'
            start += 1480
            driver.execute_script(js)
            time.sleep(0.5)
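Both files import BookShopItem from the project's items.py, which the original post does not show. A minimal sketch of what it would need to contain, assuming only the book_name_href field that the spider actually fills:

import scrapy

# items.py -- minimal sketch; only the field used by the spider above is defined
class BookShopItem(scrapy.Item):
    book_name_href = scrapy.Field()  # list of relative links to the book detail pages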
Pipeline file
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html
import pymysql
import requests
import parsel
import pandas as pd
from sqlalchemy import create_engine
class BookShopPipeline:
    def __init__(self):
        self.conn = None
        self.cursor = None

    def open_spider(self, spider):
        # Called once when the spider starts: open the connection and create the tables.
        self.conn = pymysql.connect(
            host='your-database-host',
            user='',
            password='',
            db='book_shop',
            charset='utf8mb4'
        )
        self.cursor = self.conn.cursor()
        self.create_table()
    def process_item(self, item, spider):
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/126.0.0.0 Safari/537.36 Edg/126.0.0.0'}
        # SQLAlchemy engine for pandas.to_sql(); fill in your own credentials and host.
        engine = create_engine('mysql+pymysql://root:root@your-database-host/book_shop')
        book_ids = []
        for j in range(len(item['book_name_href'])):
            book_name_url = f"http://www.bqgui.cc{item['book_name_href'][j]}"
            response = requests.get(book_name_url, headers=headers)
            book_name_selector = parsel.Selector(response.text)
            book_name = book_name_selector.xpath('//div[@class="info"]/h1/text()').get()
            print(book_name)
            if book_name is not None and book_name.strip():
                sql = "INSERT INTO Books(BookName) VALUES(%s)"
                self.cursor.execute(sql, (book_name,))  # note: the parameter has to be a tuple
                book_id = self.cursor.lastrowid  # grab lastrowid right away
                book_ids.append(book_id)  # remember the book ID
                self.conn.commit()
            else:
                print("This book page is blank, skipping it")
                continue
            # Walk every chapter link of this book and store the chapter text.
            book_href = book_name_selector.xpath('//div[@class="listmain"]/dl/dd/a/@href').getall()
            for z in range(len(book_href)):
                try:
                    response2 = requests.get(f'http://www.bqgui.cc{book_href[z]}', headers=headers, timeout=10)
                    response2.raise_for_status()  # raise HTTPError if the status code is not 200
                    chapters = response2.text
                    chapters_title = parsel.Selector(chapters).xpath('//div[@class="content"]/h1/text()').get()
                    if chapters_title is not None and chapters_title.strip():
                        chapters_info = parsel.Selector(chapters).xpath(
                            '//div[@id="chaptercontent"]/text()').getall()
                        chapters_info_str = '\n'.join(chapters_info)  # join the lines into one newline-separated string
                        df = pd.DataFrame({
                            'BookID': [book_id],
                            'ChapterTitle': [chapters_title],
                            'ChapterContent': [chapters_info_str],
                        })
                        df.to_sql('Chapters', engine, if_exists='append', index=False)
                        print(chapters_title)
                        print("Chapter data saved")
                except Exception as e:
                    print(f"Failed to fetch this chapter, moving on to the next one: {e}")
                    continue
        print("Finished")
        return item

    def close_spider(self, spider):
        # Close the connection when the whole crawl ends, not inside process_item.
        self.cursor.close()
        self.conn.close()
    def create_table(self):
        # Books table; the NOT NULL / UNIQUE constraints are left commented out, as in the original.
        sql_create_books = '''
        CREATE TABLE IF NOT EXISTS Books (
            BookID INT AUTO_INCREMENT PRIMARY KEY,
            BookName VARCHAR(255)
            -- NOT NULL,
            -- UNIQUE (BookName)  -- assuming book names are unique
        ) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4;
        '''
        self.cursor.execute(sql_create_books)
        # Chapters table, linked to Books through a foreign key.
        sql_create_chapters = '''
        CREATE TABLE IF NOT EXISTS Chapters (
            ChapterID INT AUTO_INCREMENT PRIMARY KEY,
            BookID INT NOT NULL,
            ChapterTitle VARCHAR(255) NOT NULL,
            ChapterContent TEXT,
            FOREIGN KEY (BookID) REFERENCES Books(BookID) ON DELETE CASCADE
        ) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4;
        '''
        self.cursor.execute(sql_create_chapters)
        self.conn.commit()
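As the template comment at the top of the pipeline file says, the pipeline must be registered in settings.py or Scrapy will never call it. A minimal sketch, assuming the Scrapy project package is named book_shop (adjust the dotted path to your own project):

# settings.py -- minimal sketch; "book_shop" is an assumed package name
ITEM_PIPELINES = {
    "book_shop.pipelines.BookShopPipeline": 300,
}

With that in place, the crawl is started with "scrapy crawl get_book", the name defined on GetBookSpider.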