柚子快報激活碼778899分享:今天開始學爬蟲
柚子快報激活碼778899分享:今天開始學爬蟲
import requests
from bs4 import BeautifulSoup
import pandas as pd
# Address of the Douban Top 250 listing that the scraper downloads.
base_url = "https://movie.douban.com/top250"

# HTTP headers passed along with each page request; the User-Agent value is
# a desktop Chrome identification string.
headers = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/58.0.3029.110 Safari/537.3"
    ),
}
def _extract_director(info_text):
    """Pull the director token out of a movie's info paragraph text.

    The value is the second space-separated token on the second line of
    *info_text* (presumably that line looks like ``<label>: <name> ...`` --
    verify against the live page).

    Returns:
        The token, or ``""`` when the text has fewer lines or tokens than
        expected, instead of raising ``IndexError``.
    """
    lines = info_text.split('\n')
    if len(lines) < 2:
        return ""
    tokens = lines[1].strip().split(' ')
    return tokens[1] if len(tokens) > 1 else ""


def _parse_movie_item(item):
    """Build the record dict for one ``<div class="item">`` movie element."""
    # The one-line quote is optional; look it up once and fall back to "".
    quote = item.find('span', class_='inq')
    # The last <span> in the star block carries the review count followed by
    # a three-character suffix, which the slice below removes.
    votes_text = item.find('div', class_='star').find_all('span')[-1].text
    return {
        'Rank': item.find('em').text,
        'Title': item.find('span', class_='title').text,
        'Link': item.find('a')['href'],
        'Image': item.find('img')['src'],
        'Rating': item.find('span', class_='rating_num').text,
        'Number of Reviews': votes_text[:-3],
        'Summary': quote.text if quote else "",
        'Director': _extract_director(item.find('p').text),
    }


def fetch_page_data(url):
    """Scrape one listing page into a list of movie records.

    Args:
        url: Address of the listing page to download.

    Returns:
        A list with one dict per ``<div class="item">`` element found on the
        page, in page order. Each dict has the keys ``Rank``, ``Title``,
        ``Link``, ``Image``, ``Rating``, ``Number of Reviews``, ``Summary``
        and ``Director``; every value is a string taken from the page.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.Timeout: If the server does not respond within 10 seconds.
    """
    # Without an explicit timeout a stalled connection would block forever.
    response = requests.get(url, headers=headers, timeout=10)
    # Fail loudly on an error page rather than silently returning no movies.
    response.raise_for_status()
    soup = BeautifulSoup(response.text, "html.parser")
    return [
        _parse_movie_item(item)
        for item in soup.find_all('div', class_='item')
    ]
# Gather the records of every listing page: the ``start`` query value steps
# from 0 to 225 in increments of 25.
all_data = []
for i in range(0, 250, 25):
    url = "{}?start={}&filter=".format(base_url, i)
    all_data += fetch_page_data(url)

# Turn the collected records into a table and store it as a spreadsheet
# without the index column.
df = pd.DataFrame(data=all_data)
df.to_excel(excel_writer="Douban_Top_250_Movies.xlsx", index=False)
print("Data has been saved to Douban_Top_250_Movies.xlsx")
import pandas as pd
import matplotlib.pyplot as plt
# Load the scraped spreadsheet.
df = pd.read_excel("Douban_Top_250_Movies.xlsx", engine='openpyxl')

# Select the SimHei font so CJK chart labels can be drawn, and render the
# minus sign as an ASCII hyphen.
plt.rcParams['font.sans-serif'] = ['SimHei']
plt.rcParams['axes.unicode_minus'] = False

# Report the table size and the dtype of every column.
print("數據規模:", df.shape)
print("字段數據類型:\n", df.dtypes)

# Report summary statistics.
print("統計信息:\n", df.describe())

# Drop every row that has a missing value in any column.
# NOTE(review): movies scraped with an empty Summary are likely read back as
# missing and therefore dropped here too -- confirm that this is intended.
clean_df = df.dropna()
print("處理后記錄數:", clean_df.shape[0])

# Save the cleaned table.
clean_df.to_excel("Douban_Top_250_Movies_Clean.xlsx", index=False)
# Chart 1: bar chart of the ratings of up to 10 randomly sampled movies.
plt.figure(figsize=(10, 5))
# DataFrame.sample raises ValueError when asked for more rows than exist
# (without replacement), so cap the sample size at the number of clean rows.
sample_movies = clean_df.sample(min(10, len(clean_df)))
plt.bar(sample_movies['Title'], sample_movies['Rating'], color='pink')
plt.xlabel('電影名稱')
plt.ylabel('評分')
plt.title('XX - 10部電影評分柱狀圖')
# Tilt the movie titles so neighbouring labels do not overlap.
plt.xticks(rotation=45)
plt.tight_layout()
plt.savefig('Visualization_1.png')
plt.show()
# Chart 2: bar chart of the four directors with the most listed movies.
# value_counts() orders directors by descending frequency, so the first four
# entries are the top four; compute it once for both labels and heights.
top_directors = clean_df['Director'].value_counts().head(4)
directors = top_directors.index
director_counts = top_directors.values
plt.figure(figsize=(10, 5))
plt.bar(directors, director_counts, color='green')
plt.xlabel('導演')
plt.ylabel('上榜電影數量')
plt.title('從聰202209305 - 導演上榜電影數量柱狀圖')
plt.tight_layout()
plt.savefig('Visualization_2.png')
plt.show()
# Chart 3: line chart of the review counts of up to 10 randomly sampled movies.
# DataFrame.sample raises ValueError when asked for more rows than exist
# (without replacement), so cap the sample size at the number of clean rows.
sample_movies = clean_df.sample(min(10, len(clean_df)))
plt.figure(figsize=(10, 5))
plt.plot(
    sample_movies['Title'],
    sample_movies['Number of Reviews'],
    marker='o',
    linestyle='-',
    color='orange',
)
plt.xlabel('電影名稱')
plt.ylabel('評論人數')
plt.title('XX - 10部電影評論人數趨勢圖')
# Tilt the movie titles so neighbouring labels do not overlap.
plt.xticks(rotation=45)
plt.tight_layout()
plt.savefig('Visualization_3.png')
plt.show()
柚子快報激活碼778899分享:今天開始學爬蟲
相關文章
本文內容根據網絡資料整理,出于傳遞更多信息之目的,不代表金鑰匙跨境贊同其觀點和立場。
轉載請注明,如有侵權,聯系刪除。