Python爬豆瓣电影

几年前搞的豆瓣爬虫,拯救电影荒,爬了下列表、评分、演员导演表等等信息,但是讽刺的是,这几年几乎快告别电影了,在家也不怎么看,也不怎么跑电影院。偶尔和电影接触还是在抖音“N分钟看完……”。有些东西还是要慢慢品味吧。代码分享一下,很早发到github上了,有需要的朋友自取:https://github.com/Jackielzq/douban_movie

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
import requests
import pandas as pd
from bs4 import BeautifulSoup
import json
import re

#初始化变量
list_url = 'https://movie.douban.com/j/search_subjects?type=movie&tag=%E8%B1%86%E7%93%A3%E9%AB%98%E5%88%86&sort=rank&page_limit=500&page_start=0'
url = 'https://movie.douban.com/subject/1292052/?tag=%E8%B1%86%E7%93%A3%E9%AB%98%E5%88%86&from=gaia_video'


def get_movie_list(url):

res = requests.get(url)
df = pd.DataFrame(json.loads(res.text)['subjects'])
return df


def get_movie_info(url): #获取具体某一电影的详细信息

header = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.75 Safari/537.36',
'Connection': 'keep-alive',
'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3' ,
'Referer': 'https://www.douban.com/'
}

res = requests.get(url = url, headers=header)
soup = BeautifulSoup(res.text,'html.parser')
content = soup.select('#info span')

movie_info = {}

#获取详细字段
movie_info['director'] = soup.select('#info [rel="v:directedBy"]')[0].text #导演
movie_info['scriptwriter'] = content[5].text.strip().split('/') #编剧
movie_info_actor = soup.select('#info [rel="v:starring"]') #演员,多个演员,需获取列表
movie_info['actor']= []
for i in movie_info_actor:
movie_info['actor'].append(i.text)

movie_info_type = soup.select('#info [property="v:genre"]') #类型,可能属于多种类型
movie_info['type']= []
for i in movie_info_type:
movie_info['type'].append(i.text)

movie_info['runtime'] = soup.select('#info [property="v:runtime"]')[0].text #时长

movie_info['launch_time'] = soup.select('#info [property="v:initialReleaseDate"]')[0].text if soup.select('#info [property="v:initialReleaseDate"]') else '无' #上映日期

county_pattern = re.compile(r'<span class="pl">制片国家/地区:</span>(.*)<br/>') #国家
movie_info['country'] = re.findall(county_pattern,str(soup))[0].strip() #没有标签包围,用正则式提取

language_pattern = re.compile(r'<span class="pl">语言:</span>(.*)<br/>') #语言
movie_info['language'] = re.findall(language_pattern,str(soup))[0].strip() #没有标签包围,用正则式提取

movie_info['summary'] = soup.select('[property="v:summary"]')[0].text.strip() if soup.select('[property="v:summary"]') else '无' #简介

return movie_info

def merge_movie_info(df):

data = {} #每个电影的详细信息记录到字典里
data_list = [] #所有电影的详情存放到列表中
count = 1
for i in df['id']:
data = {}
url = 'https://movie.douban.com/subject/' + str(i)
info = get_movie_info(url)
data['id'] = i
data['director'] = info['director']
data['scriptwriter'] = info['scriptwriter']
data['actor'] = info['actor']
data['type'] = info['type']
data['runtime'] = info['runtime']
data['launch_time'] = info['launch_time']
data['country'] = info['country']
data['language'] = info['language']
data['summary'] = info['summary']
data_list.append(data)
print('第' + str(count) +'条数据已加载,已下载' + str(count * 100/len(df)) + '%的数据' + url)
count += 1

final_data = pd.merge(df,pd.DataFrame(data_list))
return final_data


df = get_movie_list(list_url) #获取豆瓣排名前200电影列表
info_data = merge_movie_info(df) #根据每个电影ID获取每个电影的详细信息
info_data_df = pd.DataFrame(info_data)
info_data_df.to_csv('douban_movie_top500.csv')
print('数据加载完成')