国产烂片深度揭秘
【项目11】 国产烂片深度揭秘
1、项目需求
1、看看电影评分分布及烂片情况
2、什么题材的电影烂片最多?
3、和什么国家合拍更可能产生烂片?
4、演员数量是否和烂片有关?
5、不同导演每年电影产量情况是如何的?
2、实现思路
1、读取数据通过直方图和箱型图查看分布情况,筛选top20的烂片
2、从箱型图中提取低于下四分位数的影片做题材分类,获得类型内的烂片占比
3、筛选不同地区和国家数据,获得合拍片烂片占比
4、对演员数量进行分类,1-2人,3-4人,5-6人,7-9人,10人以上,查看分组后的烂片占比
5、清洗筛选不同导演07–17年的作品,去掉10部电影以下的导演,制作散点图查看
3、实现步骤
1.1、导入模块,读取数据
# Environment setup: imports, matplotlib/Bokeh configuration and data loading.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import warnings
# Suppress every warning for the remainder of the script.
warnings.filterwarnings('ignore')
from pylab import mpl
# NOTE(review): keep the rcParams overrides below AFTER style.use — a style
# sheet may reset them; confirm before reordering.
plt.style.use('classic')
mpl.rcParams['font.sans-serif'] = ['SimHei'] # set the default font
mpl.rcParams['axes.unicode_minus'] = False
from bokeh.plotting import figure,show,output_file
# Bokeh charts shown later in the script are written to this HTML file.
output_file('项目11.html')
from bokeh.models import ColumnDataSource
from bokeh.models import HoverTool
import os
# NOTE(review): hard-coded, machine-specific working directory; the relative
# Excel path below is resolved against it.
os.chdir('C:\\Users\\Administrator\\Desktop\\项目资料\\项目11国产烂片深度揭秘')
# Raw movie table used by every later section.
df1 = pd.read_excel('moviedata.xlsx')
1.2、清洗豆瓣评分大于0的数据绘制直方图和箱型图查看数据
# Keep only films with a positive Douban rating, restricted to the columns
# used by the rating analysis.
rated_columns = ['电影名称','豆瓣评分','主演','导演','类型']
df1_dy = df1.loc[df1['豆瓣评分'] > 0, rated_columns]

# Histogram of the ratings.
hist_options = dict(figsize=(10, 4), bins=50, edgecolor='k', grid=True,
                    color='g', alpha=0.7, title='豆瓣评分分布')
plt.figure()
df1_dy['豆瓣评分'].plot.hist(**hist_options)

# Horizontal box plot of the same ratings.
box_options = dict(figsize=(10, 4), vert=False, grid=True, title='豆瓣评分分布')
plt.figure()
df1_dy['豆瓣评分'].plot.box(**box_options)
1.3、筛选top20烂片
from scipy import stats

# Kolmogorov-Smirnov test of the ratings against a normal distribution that
# uses the sample mean and standard deviation as its parameters.
u = df1_dy['豆瓣评分'].mean()
std = df1_dy['豆瓣评分'].std()
print(stats.kstest(df1_dy['豆瓣评分'],'norm',(u,std)))

# Films rated below 4.3 are treated as bad films; keep the 20 lowest-rated.
df1_jg = df1_dy[df1_dy['豆瓣评分']<4.3]
df1_jg = df1_jg.sort_values('豆瓣评分',ascending = True).head(20)
# Renumber from 0. Unlike assigning ``range(20)`` to the index, this also
# works when fewer than 20 films fall below the threshold.
df1_jg = df1_jg.reset_index(drop=True)
2.1、清洗筛选不同类型影片的烂片占比
# Distinct genre names: every non-null '类型' cell of the full table is
# stripped of spaces and split on '/'.
genre_cells = df1.loc[df1['类型'].notnull(), '类型']
df2_lx = list({
    genre
    for cell in genre_cells
    for genre in cell.replace(' ', '').split('/')
})

# Rated films that have a genre, reduced to the columns f1 reads.
# NOTE(review): the genre names come from df1 while this table comes from the
# rated subset df1_dy, so a name may have no film here — confirm intended.
df2 = df1_dy.loc[df1_dy['类型'].notnull(), ['电影名称','豆瓣评分','类型']]
def f1(data, j, threshold=4.3):
    """Return the bad-film share of genre ``j`` within ``data``.

    Args:
        data: DataFrame with a '类型' column ('/'-separated genre names,
            optionally padded with spaces) and a numeric '豆瓣评分' column.
        j: Genre name to look up.
        threshold: Ratings strictly below this value count as bad films.

    Returns:
        dict with 'dylxmc' (the genre), 'lpzb' (bad films / matching films,
        NaN when no film matches) and 'lxzs' (number of matching films).
    """
    def _has_genre(cell):
        # Exact token match: the name is never interpreted as a regular
        # expression and never matches as a substring of a longer name.
        # Non-string cells (e.g. NaN) simply do not match.
        return isinstance(cell, str) and j in cell.replace(' ', '').split('/')

    dylx = data[data['类型'].map(_has_genre).astype(bool)]
    total = len(dylx)
    bad = int((dylx['豆瓣评分'] < threshold).sum())
    # A genre with no film in ``data`` used to raise ZeroDivisionError.
    lpzb = bad / total if total else float('nan')
    return {'dylxmc': j, 'lpzb': lpzb, 'lxzs': total}
# One summary dict per genre, computed by f1 over the rated films.
list_dy = [f1(df2, genre) for genre in df2_lx]

# Rank genres by bad-film share and keep the 20 highest; the marker size for
# the scatter plot grows with the square root of the film count.
genre_ranking = pd.DataFrame(list_dy).sort_values('lpzb', ascending=False)
df2_lp_top20 = genre_ranking.head(20)
df2_lp_top20 = df2_lp_top20.assign(size=df2_lp_top20['lxzs'] ** 0.5 * 2)
2.2、绘制散点图查看烂片占比情况
# Scatter plot of the bad-film share per genre (one circle per genre).
name1 = df2_lp_top20['dylxmc'].tolist()
source = ColumnDataSource(data = df2_lp_top20)

# Hover shows the film count and the bad-film share of the genre.
genre_tooltips = [("数据量", "@lxzs"),
                  ("烂片比例", "@lpzb")]
hover1 = HoverTool(tooltips=genre_tooltips)

# NOTE(review): plot_width/plot_height are spelled as in the original script;
# check them against the installed Bokeh version.
p1 = figure(
    x_range = name1,
    plot_width = 900,
    plot_height = 400,
    title = '不同烂片比例情况',
    tools=[hover1,'reset,xwheel_zoom,pan,crosshair,box_select'],
)
p1.circle(
    x = 'dylxmc',
    y = 'lpzb',
    size = 'size',
    source = source,
    fill_color = 'red',
    fill_alpha = 0.5,
    line_color = 'black',
    line_dash = [8,8],
)

# Dashed grid lines on both axes.
for grid in (p1.xgrid, p1.ygrid):
    grid.grid_line_dash = [10,4]
show(p1)
3.1、清洗筛选不同国家合拍数据,筛选合拍3部及以上电影的国家
# Films whose production-country field is present and mentions '中国大陆'.
df3 = df1[['电影名称','豆瓣评分','制片国家/地区']]
df3 = df3[df3['制片国家/地区'].notnull()]
df3 = df3[df3['制片国家/地区'].str.contains('中国大陆')]

# Distinct partner names: strip spaces, split on '/', drop empty tokens.
gj_names = set()
for cell in df3['制片国家/地区']:
    gj_names.update(name for name in cell.replace(' ', '').split('/') if name)

# Exclude the domestic labels. Set subtraction, unlike list.remove(), does not
# raise ValueError when one of these labels is absent from the data.
gjlist = list(gj_names - {'中国大陆', '香港', '台湾', '中国'})

df3_list = []
# Only films with a rating take part in the bad-film ratio.
# NOTE(review): section 1.2 also drops ratings of 0; here only nulls are
# removed — confirm whether zero ratings should count as bad films.
df3_notnull = df3[df3['豆瓣评分'].notnull()]
def f3(data, loci, threshold=4.3):
    """Return the bad-film share of co-productions with country ``loci``.

    Args:
        data: DataFrame with a '制片国家/地区' column ('/'-separated names,
            optionally padded with spaces) and a numeric '豆瓣评分' column.
        loci: Country/region name to look up.
        threshold: Ratings strictly below this value count as bad films.

    Returns:
        dict with 'gj' (the name), 'gj_lpzb' (bad films / matching films,
        NaN when no film matches) and 'gj_count' (number of matching films).
    """
    def _has_country(cell):
        # Exact token match, so names containing regex metacharacters such as
        # parentheses are compared literally and never match as substrings.
        # Non-string cells (e.g. NaN) simply do not match.
        return isinstance(cell, str) and loci in cell.replace(' ', '').split('/')

    gj_dy = data[data['制片国家/地区'].map(_has_country).astype(bool)]
    gj_count = len(gj_dy)
    bad = int((gj_dy['豆瓣评分'] < threshold).sum())
    # A name with no film in ``data`` used to raise ZeroDivisionError.
    gj_lpzb = bad / gj_count if gj_count else float('nan')
    return {'gj': loci, 'gj_lpzb': gj_lpzb, 'gj_count': gj_count}
# One summary dict per partner country, computed by f3 over the rated films.
df3_list.extend(f3(df3_notnull, country) for country in gjlist)

# Keep partners with at least 3 co-productions, highest bad-film share first.
df3_jg = pd.DataFrame(df3_list)
has_enough_films = df3_jg['gj_count'] >= 3
df3_jg = df3_jg[has_enough_films].sort_values('gj_lpzb', ascending=False)
4.1、对演员数量进行分组,查看分组烂片占比
# Number of '/'-separated names in each film's cast, bucketed into five
# cast-size groups (both columns are added to df1 itself).
df1['主演人数'] = df1['主演'].str.split('/').str.len()
cast_size_edges = [0, 2, 4, 6, 9, 100]
cast_size_labels = ['1-2人', '3-4人', '5-6人', '7-9人', '10人及以上']
df1['人数分类'] = pd.cut(df1['主演人数'], cast_size_edges, labels = cast_size_labels)

# Films that have both a cast size and a rating, flagged as bad below 4.3.
df4 = df1[['电影名称','豆瓣评分','主演','主演人数','人数分类']]
df4 = df4[df4['主演人数'].notnull() & df4['豆瓣评分'].notnull()]
df4 = df4.assign(**{'是否烂片': df4['豆瓣评分'] < 4.3})

# Row counts per (cast-size group, bad flag), split into the bad and the
# non-bad halves and joined back on the group.
df4_gb = df4.groupby(['人数分类','是否烂片']).count().reset_index()
df4_lp = df4_gb[df4_gb['是否烂片'].eq(True)]
df4_flp = df4_gb[df4_gb['是否烂片'].eq(False)]
df4_jg1 = pd.merge(df4_lp, df4_flp, on = '人数分类')

# '电影名称_x' holds the bad-film count and '电影名称_y' the non-bad count.
bad_count = df4_jg1['电影名称_x']
film_total = bad_count + df4_jg1['电影名称_y']
df4_jg1 = pd.DataFrame({
    '人数分类': df4_jg1['人数分类'],
    '烂片数量': bad_count,
    '电影总数': film_total,
    '烂片占比': bad_count / film_total,
})
4.2、查看烂片占比最高的演员top20及个别演员的影片情况
from collections import Counter


def _split_cast(cell):
    """Return the distinct, non-empty actor names of one '主演' cell."""
    return {name for name in str(cell).replace(' ', '').split('/') if name}


# Bad films with a cast, and all films with a cast.
df4_zylp = df1[(df1['豆瓣评分']<4.3)&(df1['主演'].notnull())]
df4_dyzs = df1[df1['主演'].notnull()]

# Count films and bad films per actor in one pass over the table. Names are
# matched exactly, so an actor is no longer credited with films of anyone
# whose name merely contains theirs, names are never interpreted as regular
# expressions, and an empty name (from a stray '/') no longer matches every
# film.
# NOTE(review): as in the original, unrated films stay in the denominator —
# confirm that this is intended.
zy_total = Counter()
zy_bad = Counter()
for cast, score in zip(df4_dyzs['主演'], df4_dyzs['豆瓣评分']):
    names = _split_cast(cast)
    zy_total.update(names)
    if score < 4.3:
        zy_bad.update(names)
zy_list = list(zy_total)

# Only actors with more than 2 films are ranked.
df4_jgdict = [
    {'zy': name, 'zy_dyzl': films, 'zy_lpzb': zy_bad[name] / films}
    for name, films in zy_total.items()
    if films > 2
]
# Explicit columns keep the sort below valid even when no actor qualifies.
df4_jg = pd.DataFrame(df4_jgdict, columns=['zy', 'zy_dyzl', 'zy_lpzb'])
df4_jg = df4_jg.sort_values('zy_lpzb',ascending = False)[:20]

# Films of one individual actor.
print(df4_dyzs[df4_dyzs['主演'].str.contains('吴亦凡')])
吴亦凡的影片情况
5.1、通过导演字段筛选不同导演的影片,选取07-17年的影片,剔除影片小于10部的导演
from collections import Counter


def _split_directors(cell):
    """Return the distinct, non-empty director names of one '导演' cell."""
    return {name for name in str(cell).replace(' ', '').split('/') if name}


# NOTE(review): `df5` is not defined anywhere in the visible source. Judging by
# its use here and in f5 it needs at least the columns '电影名称', '导演',
# '豆瓣评分' and 'year' — restore the missing cell that builds it.

# Keep films that have BOTH a rating and a director. The original applied the
# two filters to `df5` separately, so the second assignment discarded the
# rating filter.
df5_re = df5[df5['豆瓣评分'].notnull() & df5['导演'].notnull()]

# Director names per film, taken from the cleaned table so that a missing
# director can no longer break the name collection.
df5_dy = df5_re['导演'].map(_split_directors)
df5_dylist = [name for names in df5_dy for name in names]
df5_dymc = list(set(df5_dylist))

# Films and bad films (rating below 4.3) per director, matched by exact name.
df5_film_count = Counter(df5_dylist)
df5_bad_count = Counter(
    name
    for names, score in zip(df5_dy, df5_re['豆瓣评分'])
    if score < 4.3
    for name in names
)

# NOTE(review): the section heading says directors with fewer than 10 films
# are removed, but the condition keeps only those with MORE than 10 — confirm
# which is intended.
df5_list = [
    {
        'dymc': name,
        'lpbl': df5_bad_count[name] / df5_film_count[name],
        'dyzl': df5_film_count[name],
    }
    for name in df5_dymc
    if df5_film_count[name] > 10
]
df5_jg1 = pd.DataFrame(df5_list, columns=['dymc', 'lpbl', 'dyzl'])
def f5(data, i):
    """Return per-year mean rating and film count for director ``i``.

    Args:
        data: DataFrame with '导演' ('/'-separated names, optionally padded
            with spaces), '电影名称', '豆瓣评分' and 'year' columns.
        i: Director name; matched exactly against the names of each film.

    Returns:
        DataFrame indexed by 'year' with columns 'dbpf' (mean rating),
        'dysl' (number of films) and 'size' (``dysl * 5``, the marker size).
    """
    def _directed_by(cell):
        # Exact token match: no regex interpretation and no substring hits.
        # Non-string cells (e.g. NaN) simply do not match.
        return isinstance(cell, str) and i in cell.replace(' ', '').split('/')

    df5_i = data[data['导演'].map(_directed_by).astype(bool)]
    by_year = df5_i.groupby('year')
    # Aggregate the named columns only: averaging the whole frame fails on the
    # text columns in recent pandas, and renaming the result by position
    # relied on exactly one numeric column being present.
    df5_jg2 = pd.DataFrame({
        'dbpf': by_year['豆瓣评分'].mean(),
        'dysl': by_year['电影名称'].count(),
    })
    df5_jg2['size'] = df5_jg2['dysl'] * 5
    return df5_jg2
# Yearly rating/output tables for the four directors plotted below.
# NOTE(review): `df5_lxc` holds the table for '邓衍成'; the variable name does
# not match that director's initials.
df5_wj, df5_zw, df5_xk, df5_lxc = (
    f5(df5_re, director) for director in ('王晶', '周伟', '徐克', '邓衍成')
)
5.2、绘制散点图查看不同年份评分情况
from bokeh.models.annotations import BoxAnnotation

# Hover shows the yearly mean rating and the yearly film count.
director_tooltips = [("该年电影均分", "@dbpf"),
                     ("该年电影产量", "@dysl")]
hover = HoverTool(tooltips=director_tooltips)
p5 = figure(
    plot_width=900,
    plot_height=500,
    title="不同导演每年的电影产量及电影均分",
    tools=[hover,'reset,xwheel_zoom,pan,crosshair,box_select'],
)

# One data source per director; the 'year' index of each table supplies x.
source1 = ColumnDataSource(df5_wj)
source2 = ColumnDataSource(df5_zw)
source3 = ColumnDataSource(df5_xk)
source4 = ColumnDataSource(df5_lxc)

# Same glyph settings for every director; only source, legend and colour vary.
# NOTE(review): the `legend=` keyword is kept as in the original script; check
# it against the installed Bokeh version.
director_series = (
    (source1, "王晶", 'olive'),
    (source2, "周伟", 'blue'),
    (source3, "徐克", 'green'),
    (source4, "邓衍成", 'gray'),
)
for director_source, director_label, director_colour in director_series:
    p5.circle(x='year',y='dbpf',source = director_source,size = 'size',
              legend=director_label,fill_color = director_colour,
              fill_alpha = 0.7,line_color = None)

# Red band over the low-rating region.
# NOTE(review): the band ends at 4.4 while the bad-film threshold used in the
# rest of the script is 4.3 — confirm whether the difference is deliberate.
bg = BoxAnnotation(top=4.4,fill_alpha=0.1, fill_color='red')
p5.add_layout(bg)

# Dashed grid lines on both axes, legend in the top-right corner.
for grid in (p5.xgrid, p5.ygrid):
    grid.grid_line_dash = [10,4]
p5.legend.location = "top_right"
show(p5)
print('finished')