柚子快報邀請碼778899分享:頭歌:多線程、多進程爬蟲
step1/web/index.html文件下,將所有alt=""填入step1/images
step1/student.py文件源碼
import requests
from lxml import etree
import time
from concurrent.futures import ThreadPoolExecutor, ProcessPoolExecutor
from multiprocessing import Pool
import os
import threading
import psutil
# Request headers that present the crawler as a regular desktop browser.
header = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; WOW64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/80.0.3987.163 Safari/537.36"
    ),
}
def downimg(img_src):
    """Download one image from the local test server into ``step1/images``.

    The file is saved as ``<name>.jpg``, where ``<name>`` is the last path
    segment of ``img_src`` up to its first dot. The URL, the current
    thread/process ids and the elapsed time are printed to stdout.

    Args:
        img_src: Server-relative image path; it is appended to
            ``http://127.0.0.1:8080`` to build the download URL.

    Raises:
        requests.exceptions.RequestException: If the request fails or does
            not answer within the timeout.
    """
    start_time = time.time()
    name = img_src.split('/')[-1].split('.')[0]
    img_url = "http://127.0.0.1:8080" + img_src
    # A timeout keeps a stalled server from blocking this worker forever.
    img = requests.get(img_url, timeout=10)
    dir_path = 'step1/images'
    # exist_ok avoids the check-then-create race when several threads run
    # this function at the same time.
    os.makedirs(dir_path, exist_ok=True)
    img_path = dir_path + '/' + name + '.jpg'  # final storage path of the image
    print(img_url, name + '.jpg', '開始下載。。。')
    # current_thread() replaces the deprecated currentThread() alias.
    thread = threading.current_thread()
    # os.getpid() is the same value psutil.Process(os.getpid()).pid reported.
    print("線程ID:%s, 進程ID:%s"
          % (thread.ident, os.getpid()))
    #********** Begin *********#
    # Save the image bytes to disk.
    with open(img_path, 'wb') as file:
        file.write(img.content)
    #********** End *********#
    finisTime = time.time() - start_time
    print(name + ".jpg 用時為:" + str(finisTime) + " second")
def parsePage():
    """Fetch the image index page and download every listed image in threads.

    Requests ``http://127.0.0.1:8080/imgs/``, extracts the ``src`` of each
    ``<img>`` found under ``div.box``, prints that list, then runs
    ``downimg`` for each entry in its own thread and prints the total
    elapsed download time.

    Raises:
        requests.exceptions.RequestException: If the index page request
            fails or does not answer within the timeout.
    """
    url = "http://127.0.0.1:8080/imgs/"
    # A timeout keeps a stalled server from hanging the whole crawl.
    response = requests.get(url=url, headers=header, timeout=10)
    html_content = response.text
    #********** Begin *********#
    # Parse the page.
    html = etree.HTML(html_content)
    # The expression must start with the XPath descendant axis "//"; the
    # previous "http://div..." text was not a usable XPath expression.
    item_list = html.xpath("//div[@class='box']/div/a/img/@src")
    print(item_list)
    s_time = time.time()
    #********** End *********#
    # Sequential alternative, kept for timing comparison:
    # for item in item_list:
    #     downimg(item)
    #********** Begin *********#
    # Threaded variant: one thread per image. All threads are started before
    # any is joined so the downloads overlap.
    threads = [
        threading.Thread(target=downimg, args=(item,))
        for item in item_list
    ]
    for t in threads:
        t.start()
    for t in threads:
        t.join()
    #********** End *********#
    print('總耗時: %s' % (time.time() - s_time))
希望可以給到幫助
柚子快報邀請碼778899分享:頭歌:多線程、多進程爬蟲
精彩鏈接
本文內容根據網絡資料整理,出于傳遞更多信息之目的,不代表金鑰匙跨境贊同其觀點和立場。
轉載請注明,如有侵權,聯繫刪除。