柚子快報激活碼778899分享：網絡爬蟲 scrapy 初級

Mintifi國際明智購綜合2025-08-21540

http://yzkb.51969.com/

scrapy 補充方法

路徑拼接

      response.urljoin(不完整的鏈接)

多字典時使用

     管道中判斷是否為某一個字典

           if isinstance(item, 字典名):

                  保存

           return item

判斷網站ip類型，從而掛ip類型

        if request.url.startswith('http://'):

                request.meta["proxy"] = 'http://' + ip

        elif request.url.startswith('https://'):

                request.meta["proxy"] = 'https://' + ip

pipeline 管道

      class 管道名(object):

            # 1. 打開文件（__init__ 或 open_spider，二選一）
            def __init__(self):

                  self.文件 = open("文件名.后綴", '寫入方式', encoding="utf_8")

            def open_spider(self, spider):

                  self.文件 = open("文件名.后綴", '寫入方式', encoding="utf_8")

            # 2. 處理數據
            def process_item(self, item, spider):

                  # 管道中判斷是否為某一個字典

                  if isinstance(item, 字典名):

                        保存

                  return item

                  # 例如：
                  if isinstance(item, 字典名):

                        data = dict(item)

                        self.文件.write(json.dumps(data, ensure_ascii=False) + ',\n')

                        # ensure_ascii=False：這是因為 json.dumps 序列化時對中文默認使用的 ascii 編碼，想輸出真正的中文需要指定 ensure_ascii=False

            # 3. 關閉文件（close_spider 或 __del__，二選一）
            def close_spider(self, spider):

                  self.文件.close()  # 關閉文件

            def __del__(self):

                  self.文件.close()  # 關閉文件

setting文件

代理及user_agent

# Downloader middlewares enabled for the project. Each key is the dotted path
# of a middleware class and each value is its order number.
DOWNLOADER_MIDDLEWARES = {
    # 'demo_58.middlewares.Demo58DownloaderMiddleware': 543,
    # Random User-Agent
    'demo_58.middlewares.UserAgentDownloadMiddleware': 543,
    # Random proxy IP
    'demo_58.middlewares.RandomProxy': 542,
}

請求頭

# Default headers for outgoing requests.
DEFAULT_REQUEST_HEADERS = {
    'accept-encoding': 'gzip, deflate, br',
    'accept-language': 'zh-CN,zh;q=0.9',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    # NOTE(review): HTTP header names are case-insensitive, so this entry
    # presumably collides with 'accept-language' above -- confirm which value
    # is actually sent and drop the other.
    'Accept-Language': 'en',
    'referer': 'https://cs.58.com/chuzu/?PGTID=0d100000-0019-e694-7c7e-9295d99e15c1&ClickID=2',
}

middlewares文件

隨機ip

class RandomProxy:
    """Downloader middleware that assigns a random proxy to every request."""

    # Candidate proxies as "host:port" strings; one is picked per request.
    ip_list = [
        '124.116.116.13:4228',
        '122.194.194.139:4212',
        '36.42.248.45:4215',
        '1.83.250.183:4228',
        '49.85.43.175:4223',
        '121.205.229.70:4231',
    ]

    # The method name is dictated by Scrapy's downloader-middleware interface.
    def process_request(self, request, spider):
        """Store a randomly chosen proxy in ``request.meta['proxy']``.

        Args:
            request: The outgoing request; its ``meta`` dict is modified
                in place.
            spider: The spider that issued the request (unused).

        Returns:
            None.
        """
        proxy = random.choice(self.ip_list)
        # The original author notes that a scheme prefix is required when the
        # proxies are kept as bare "host:port" strings; omitting it errors.
        request.meta['proxy'] = 'https://' + proxy
        print('IP:', request.meta)

隨機user_agent

class UserAgentDownloadMiddleware:
    """Downloader middleware that sets a random User-Agent on every request."""

    # Pool of User-Agent strings; one is picked per request.
    user_agent = [
        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.1 (KHTML, like Gecko) '
        'Chrome/14.0.835.163 Safari/535.1',
        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/534.50 (KHTML, like Gecko) '
        'Version/5.1 Safari/534.50',
        'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:77.0) Gecko/20190101 Firefox/77.0',
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like '
        'Gecko) Chrome/70.0.3538.77 Safari/537.36',
        'Opera/9.80 (X11; Linux i686; Ubuntu/14.10) Presto/2.12.388 Version/12.16.2',
    ]

    # The method name is dictated by Scrapy's downloader-middleware interface;
    # every request handed to the downloader passes through it.
    def process_request(self, request, spider):
        """Overwrite the request's ``User-Agent`` header with a random entry.

        Args:
            request: The outgoing request; its ``headers`` mapping is modified
                in place.
            spider: The spider that issued the request (unused).

        Returns:
            None.
        """
        # Pick a random User-Agent and apply it to the request.
        u_a = random.choice(self.user_agent)
        request.headers['User-Agent'] = u_a

item字典

# 租房

class Demo58Item_zufang(scrapy.Item):
    """Item for one rental listing."""

    title = scrapy.Field()  # rental listing title
    price = scrapy.Field()  # rental listing price

# 二手房

class Demo58Item_ershou(scrapy.Item):
    """Item for one second-hand housing listing."""

    title = scrapy.Field()  # second-hand listing title
    price = scrapy.Field()  # second-hand listing price

spider文件

import scrapy

# 導入字典
from demo_58.items import Demo58Item_zufang, Demo58Item_ershou

class SpiderSpider(scrapy.Spider):
    """Spider that collects rental and second-hand housing listings."""

    name = 'spider'
    # Restrict the crawl to this domain.
    allowed_domains = ['58.com']
    # URL(s) the crawl starts from.
    start_urls = ['http://58.com/']

    def parse(self, response):
        """Follow the start-page links that lead to the two listing sections.

        Yields:
            A ``scrapy.Request`` for every link that points at the rental
            (``chuzu``) or second-hand (``ershoufang``) section.
        """
        links = response.xpath(
            '//div[@class="board"]//span[@class="contentAdTilRt"]/a/@href'
        ).extract()

        for link in links:
            # response.urljoin() completes a partial link into a full URL.
            href = response.urljoin(link)

            # Only the selected sub-pages are crawled.
            if '58.com/chuzu/' in href:
                yield scrapy.Request(url=href, callback=self.get_zufang_data)
            if '58.com/ershoufang/' in href:
                yield scrapy.Request(url=href, callback=self.get_ershoufang_data)

    def get_zufang_data(self, response):
        """Parse one rental listing page.

        Yields:
            One ``Demo58Item_zufang`` per title/price pair found on the page,
            followed by requests for rental pages 2 through 11.
        """
        titles = response.xpath('//div[@class="des"]/h2/a/text()').extract()
        prices = response.xpath(
            '//div[@class="list-li-right"]//div[@class="money"]/b/text()'
        ).extract()

        for title, price in zip(titles, prices):
            zufang_item = Demo58Item_zufang()
            zufang_item['title'] = title
            zufang_item['price'] = price
            # Hand the item back to the engine.
            yield zufang_item

        # Build the pagination requests.
        for i in range(2, 12):
            print('當前正在下載租房的第{}頁'.format(i))
            z_next_url = 'https://cs.58.com/chuzu/pn{}/'.format(i)
            # callback names the method that will handle the response.
            yield scrapy.Request(url=z_next_url, callback=self.get_zufang_data)

    def get_ershoufang_data(self, response):
        """Parse one second-hand housing listing page.

        Yields:
            One ``Demo58Item_ershou`` per title/price pair found on the page
            (title stripped of surrounding whitespace), followed by requests
            for second-hand pages 2 through 11.
        """
        titles = response.xpath(
            '//div[@class="property-content-detail"]'
            '/div[@class="property-content-title"]/h3/text()'
        ).extract()
        prices = response.xpath(
            '//p[@class="property-price-total"]'
            '/span[@class="property-price-total-num"]/text()'
        ).extract()

        for title, price in zip(titles, prices):
            ershou_item = Demo58Item_ershou()
            ershou_item['title'] = title.strip()
            ershou_item['price'] = price
            # Hand the item back to the engine.
            yield ershou_item

        # Build the pagination requests.
        for i in range(2, 12):
            print('當前正在下載二手房的第{}頁'.format(i))
            next_url = 'https://cs.58.com/ershoufang/p{}/'.format(i)
            yield scrapy.Request(url=next_url, callback=self.get_ershoufang_data)

if __name__ == '__main__':
    # Launcher: passes the `scrapy crawl spider` arguments to Scrapy's cmdline.
    from scrapy import cmdline

    cmdline.execute(['scrapy', 'crawl', 'spider'])

柚子快報激活碼778899分享：網絡爬蟲 scrapy 初級

http://yzkb.51969.com/

參考鏈接

評論可見，查看隱藏內容

本文內容根據網絡資料整理，出于傳遞更多信息之目的，不代表金鑰匙跨境贊同其觀點和立場。

轉載請注明，如有侵權，聯系刪除。

本文鏈接：http://gantiao.com.cn/post/19115937.html