
Scrapy: quickly crawling an entire site

Spider file

import time

import parsel
import pymysql
import scrapy
from selenium import webdriver
from selenium.webdriver.chrome.service import Service

from ..items import BookShopItem


class GetBookSpider(scrapy.Spider):
    name = "get_book"
    allowed_domains = ["bqgui.cc"]
    start_urls = ["https://bqgui.cc"]

    def open_spider(self):
        # Open a MySQL connection; fill in your own host, user and password.
        conn = pymysql.connect(
            host='your-database-host',
            user='',
            password='',
            db='book_shop',
            charset='utf8mb4'
        )
        return conn

    def parse(self, response):
        # book_index = ['xuanhuan', 'wuxia', 'dushi', 'lishi', 'wangyou', 'kehuan', 'mm']
        book_index = ['xuanhuan']

        # The category pages are rendered with JavaScript, so load them with Selenium.
        service = Service(executable_path='chromedriver.exe')
        driver = webdriver.Chrome(service=service)

        conn = self.open_spider()
        cursor = conn.cursor()

        for category in book_index:
            url = f'https://bqgui.cc/{category}'
            driver.get(url=url)
            driver.implicitly_wait(10)
            driver.minimize_window()
            # self.drop_drown(driver)  # scroll down to trigger lazy loading if needed

            selector = parsel.Selector(driver.page_source)
            book_name_href = selector.xpath('//div[@class="item"]/dl/dt/a/@href').getall()
            yield BookShopItem(book_name_href=book_name_href)

        cursor.close()
        conn.close()
        driver.quit()

    def drop_drown(self, driver):
        # Scroll the page in steps so that lazily loaded items are rendered.
        start = 0
        for _ in range(21):
            driver.execute_script(f'document.documentElement.scrollTop={start}')
            start += 1480
            time.sleep(0.5)
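
The spider yields a BookShopItem imported from the project's items.py, which the post does not show. A minimal sketch consistent with the single field used above (book_name_href) might look like this; if your project defines more item classes or fields, keep those as well.

# items.py -- not shown in the original post; minimal guess based on the field used above
import scrapy


class BookShopItem(scrapy.Item):
    # List of relative links to book detail pages collected by the spider.
    book_name_href = scrapy.Field()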

Pipeline file

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html

import pymysql
import requests
import parsel
import pandas as pd
from sqlalchemy import create_engine

from .items import BookShopItem


class BookShopPipeline:

    def __init__(self):
        self.conn = None
        self.cursor = None

    def open_spider(self, spider):
        # Fill in your own database host, user and password.
        self.conn = pymysql.connect(
            host='your-database-host',
            user='',
            password='',
            db='book_shop',
            charset='utf8mb4'
        )
        self.cursor = self.conn.cursor()
        self.create_table()

    def close_spider(self, spider):
        self.cursor.close()
        self.conn.close()

    def process_item(self, item, spider):
        book_ids = []
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
                          '(KHTML, like Gecko) Chrome/126.0.0.0 Safari/537.36 Edg/126.0.0.0'
        }
        # Fill in your own database host for the SQLAlchemy engine used by pandas.
        engine = create_engine('mysql+pymysql://root:root@{your-database-host}/book_shop')

        for href in item['book_name_href']:
            book_name_url = f'http://www.bqgui.cc{href}'
            response = requests.get(book_name_url, headers=headers)
            book_name_selector = parsel.Selector(response.text)
            book_name = book_name_selector.xpath('//div[@class="info"]/h1/text()').get()
            print(book_name)

            if book_name is not None and book_name.strip():
                sql = "INSERT INTO Books(BookName) VALUES(%s)"
                self.cursor.execute(sql, (book_name,))  # note: the parameter is passed as a tuple
                book_id = self.cursor.lastrowid          # fetch lastrowid right away
                book_ids.append(book_id)                 # keep the book ID
                self.conn.commit()
            else:
                print("Book name is empty, skipping this book")
                continue

            # Walk the chapter list of this book and store every chapter.
            book_href = book_name_selector.xpath('//div[@class="listmain"]/dl/dd/a/@href').getall()
            for chapter_href in book_href:
                try:
                    response2 = requests.get(f'http://www.bqgui.cc{chapter_href}',
                                             headers=headers, timeout=10)
                    response2.raise_for_status()  # raise HTTPError for non-200 responses
                    chapters = response2.text
                    chapters_title = parsel.Selector(chapters).xpath(
                        '//div[@class="content"]/h1/text()').get()
                    if chapters_title is not None and chapters_title.strip():
                        chapters_info = parsel.Selector(chapters).xpath(
                            '//div[@id="chaptercontent"]/text()').getall()
                        chapters_info_str = '\n'.join(chapters_info)  # join the list with newlines
                        df = pd.DataFrame({
                            'BookID': [book_id],
                            'ChapterTitle': [chapters_title],
                            'ChapterContent': [chapters_info_str],
                        })
                        df.to_sql('Chapters', engine, if_exists='append', index=False)
                        print(chapters_title)
                        print("Chapter saved")
                except Exception as e:
                    print(f"Failed to fetch chapter, moving on: {e}")
                    continue

        print("Done")
        return item

    def create_table(self):
        sql_create_books = '''
            CREATE TABLE IF NOT EXISTS Books (
                BookID INT AUTO_INCREMENT PRIMARY KEY,
                BookName VARCHAR(255)
                -- NOT NULL,
                -- UNIQUE (BookName)  -- if book names are assumed to be unique
            ) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4;
        '''
        self.cursor.execute(sql_create_books)

        # Create the Chapters table, linked to Books through a foreign key.
        sql_create_chapters = '''
            CREATE TABLE IF NOT EXISTS Chapters (
                ChapterID INT AUTO_INCREMENT PRIMARY KEY,
                BookID INT NOT NULL,
                ChapterTitle VARCHAR(255) NOT NULL,
                ChapterContent TEXT,
                FOREIGN KEY (BookID) REFERENCES Books(BookID) ON DELETE CASCADE
            ) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4;
        '''
        self.cursor.execute(sql_create_chapters)
        self.conn.commit()
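
As the comment at the top of the pipeline file notes, the pipeline only runs if it is registered in ITEM_PIPELINES. A minimal settings.py entry, assuming the Scrapy project package is named book_shop (adjust the module path to your own project), could be:

# settings.py -- the package name book_shop is an assumption, use your own project name
ITEM_PIPELINES = {
    'book_shop.pipelines.BookShopPipeline': 300,
}

With the pipeline enabled, the crawl is started from the project root with scrapy crawl get_book, matching the spider's name attribute.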
