国产烂片深度揭秘

【项目11】 国产烂片深度揭秘

1、项目需求

1、看看电影评分分布及烂片情况
2、什么题材的电影烂片最多?
3、和什么国家合拍更可能产生烂片?
4、演员数量是否和烂片有关?
5、不同导演每年电影产量情况是如何的?

2、实现思路

1、读取数据,通过直方图和箱型图查看评分分布情况,筛选评分最低的top20烂片
2、以箱型图的下四分位数(代码中取4.3分)为界,将评分低于该值的影片视为烂片,按题材分类,获得各类型内的烂片占比
3、筛选不同地区和国家数据,获得合拍片烂片占比
4、对演员数量进行分类,1-2人,3-4人,5-6人,7-9人,10人以上,查看分组后的烂片占比
5、清洗筛选不同导演07–17年的作品,去掉作品在10部及以下的导演,制作散点图查看

3、实现步骤

1.1、导入模块,读取数据

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')
from pylab import mpl
plt.style.use('classic')
mpl.rcParams['font.sans-serif'] = ['SimHei'] # 指定默认字体
mpl.rcParams['axes.unicode_minus'] = False 
from bokeh.plotting import figure,show,output_file
output_file('项目11.html')
from bokeh.models import ColumnDataSource
from bokeh.models import HoverTool
import os

# Switch the working directory to the project data folder so the relative
# file name below resolves. The path is hard-coded and machine-specific.
os.chdir('C:\\Users\\Administrator\\Desktop\\项目资料\\项目11国产烂片深度揭秘')
# Load the movie table from the Excel workbook; every later step reads df1.
df1 = pd.read_excel('moviedata.xlsx')

1.2、筛选豆瓣评分大于0的数据,绘制直方图和箱型图查看评分分布

# Keep only films with a positive Douban rating, reduced to the columns that
# the later steps read.
df1_dy = df1.loc[
    df1['豆瓣评分'] > 0,
    ['电影名称', '豆瓣评分', '主演', '导演', '类型'],
]

# Histogram of the ratings (50 bins).
plt.figure()
df1_dy['豆瓣评分'].plot.hist(
    bins=50,
    figsize=(10, 4),
    title='豆瓣评分分布',
    color='g',
    alpha=0.7,
    edgecolor='k',
    grid=True,
)

# Horizontal box plot of the same ratings.
plt.figure()
df1_dy['豆瓣评分'].plot.box(
    vert=False,
    figsize=(10, 4),
    title='豆瓣评分分布',
    grid=True,
)

1.3、筛选top20烂片

from scipy import stats

# Kolmogorov-Smirnov test of the ratings against a normal distribution whose
# parameters are the sample mean and sample standard deviation.
u = df1_dy['豆瓣评分'].mean()
std = df1_dy['豆瓣评分'].std()
print(stats.kstest(df1_dy['豆瓣评分'],'norm',(u,std)))  # distribution check

# Films rated below 4.3 (the bad-film cutoff used throughout this script),
# sorted ascending; keep at most the 20 lowest-rated ones.
df1_jg = df1_dy[df1_dy['豆瓣评分']<4.3]
df1_jg = df1_jg.sort_values('豆瓣评分',ascending = True)[:20]
# Renumber the rows 0..n-1. reset_index adapts to however many rows exist,
# whereas assigning range(20) fails when fewer than 20 films qualify.
df1_jg = df1_jg.reset_index(drop = True)

2.1、清洗筛选不同类型影片的烂片占比

# Distinct genre labels: every non-null '类型' cell is a '/'-separated list of
# labels, possibly padded with spaces.
df2_lx = list({
    label
    for cell in df1[df1['类型'].notnull()]['类型']
    for label in cell.replace(' ', '').split('/')
})

# Rated films that have a genre, reduced to the columns the genre analysis uses.
df2 = df1_dy.loc[df1_dy['类型'].notnull(), ['电影名称', '豆瓣评分', '类型']]

def f1(data,j):
    """Return the bad-film share of one genre.

    Parameters
    ----------
    data : DataFrame
        Must provide a string column '类型' and a numeric column '豆瓣评分'.
    j : str
        Genre label. It is matched literally (not as a regular expression)
        as a substring of the '类型' column.

    Returns
    -------
    dict
        'dylxmc' - the genre label,
        'lpzb'   - share of matching films rated below 4.3, or NaN when no
                   film matches the label,
        'lxzs'   - number of matching films.
    """
    # regex=False: a label containing characters such as '+' or '(' must not
    # be interpreted as a pattern.
    dylx = data[data['类型'].str.contains(j, regex=False)]
    total = len(dylx)
    bad = len(dylx[dylx['豆瓣评分'] < 4.3])
    # A label can be absent from `data` (labels and rows may come from
    # different subsets); report NaN instead of dividing by zero.
    lpzb = bad / total if total else float('nan')
    return {'dylxmc': j, 'lpzb': lpzb, 'lxzs': total}

# One statistics dict per genre label.
list_dy = [f1(df2, genre) for genre in df2_lx]

# Rank genres by bad-film share (highest first) and keep the first 20.
df2_lp_top20 = pd.DataFrame(list_dy).sort_values('lpzb', ascending=False).head(20)
# Bubble size for the scatter plot: twice the square root of the film count.
df2_lp_top20['size'] = 2 * df2_lp_top20['lxzs'] ** 0.5

2.2、绘制散点图查看烂片占比情况

# Genre names, in ranking order, form the categorical x-axis.
name1 = df2_lp_top20['dylxmc'].tolist()
source = ColumnDataSource(data=df2_lp_top20)

# Hover tooltip: film count and bad-film share of the hovered genre.
hover1 = HoverTool(tooltips=[
    ("数据量", "@lxzs"),
    ("烂片比例", "@lpzb"),
])
p1 = figure(
    title='不同烂片比例情况',
    x_range=name1,
    plot_width=900,
    plot_height=400,
    tools=[hover1, 'reset,xwheel_zoom,pan,crosshair,box_select'],
)

# One half-transparent red bubble per genre; its size is read from the
# 'size' column of the source.
p1.circle(
    x='dylxmc',
    y='lpzb',
    size='size',
    source=source,
    fill_color='red',
    fill_alpha=0.5,
    line_color='black',
    line_dash=[8, 8],
)

# Dashed grid lines on both axes, then render to the output file.
for grid in (p1.xgrid, p1.ygrid):
    grid.grid_line_dash = [10, 4]
show(p1)

国产烂片深度揭秘

3.1、清洗筛选不同国家合拍数据,筛选合拍电影不少于3部的国家

# Films whose production-country field is present and mentions mainland China.
df3 = df1[['电影名称','豆瓣评分','制片国家/地区']]
df3 = df3[df3['制片国家/地区'].notnull()]
df3 = df3[df3['制片国家/地区'].str.contains('中国大陆')]

# Every label that appears in the '/'-separated country field of those films.
gjlist = []
for i in df3['制片国家/地区'].str.replace(' ','').str.split('/'):
    gjlist.extend(i)
# De-duplicate and drop the Chinese labels themselves, leaving only the
# co-production partners. Filtering tolerates a label that is absent from the
# data, whereas list.remove raises ValueError in that case.
gjlist = [gj for gj in set(gjlist)
          if gj not in ('中国大陆', '香港', '台湾', '中国')]

# Accumulator for the per-country statistics dicts.
df3_list = []

# Co-productions that carry a Douban rating.
df3_notnull = df3.dropna(subset=['豆瓣评分'])
def f3(data,loci):
    """Return the bad-film share of co-productions with one country/region.

    Parameters
    ----------
    data : DataFrame
        Must provide a string column '制片国家/地区' and a numeric column
        '豆瓣评分'.
    loci : str
        Country/region label. It is matched literally (not as a regular
        expression) as a substring of '制片国家/地区'.

    Returns
    -------
    dict
        'gj'       - the label,
        'gj_lpzb'  - share of matching films rated below 4.3, or NaN when no
                     film matches the label,
        'gj_count' - number of matching films.
    """
    # regex=False: labels containing '(' , '.', '+' etc. are plain text here.
    gj_dy = data[data['制片国家/地区'].str.contains(loci, regex=False)]
    gj_count = len(gj_dy)
    bad = len(gj_dy[gj_dy['豆瓣评分'] < 4.3])
    # The label list may come from a wider subset than `data`; report NaN
    # rather than dividing by zero when nothing matches.
    gj_lpzb = bad / gj_count if gj_count else float('nan')
    return {'gj': loci, 'gj_lpzb': gj_lpzb, 'gj_count': gj_count}

# One statistics dict per partner country/region.
df3_list.extend(f3(df3_notnull, gj) for gj in gjlist)

# Tabulate, keep partners with at least 3 co-productions, and rank them by
# bad-film share (highest first).
df3_jg = pd.DataFrame(df3_list)
df3_jg = df3_jg.loc[df3_jg['gj_count'] >= 3].sort_values('gj_lpzb', ascending=False)

国产烂片深度揭秘
4.1、对演员数量进行分组,查看分组烂片占比

# Number of lead actors per film: '主演' is a '/'-separated string.
df1['主演人数'] = df1['主演'].str.split('/').str.len()
# Bucket the counts into the five labelled groups.
df1['人数分类'] = pd.cut(
    df1['主演人数'],
    [0, 2, 4, 6, 9, 100],
    labels=['1-2人', '3-4人', '5-6人', '7-9人', '10人及以上'],
)

# Films that have both a cast size and a rating, flagged as bad below 4.3.
df4 = df1[['电影名称', '豆瓣评分', '主演', '主演人数', '人数分类']]
df4 = df4.dropna(subset=['主演人数', '豆瓣评分'])
df4['是否烂片'] = df4['豆瓣评分'] < 4.3

# Non-null counts per (cast-size group, bad-film flag).
df4_gb = df4.groupby(['人数分类', '是否烂片']).count().reset_index()
df4_lp = df4_gb[df4_gb['是否烂片'].eq(True)]
df4_flp = df4_gb[df4_gb['是否烂片'].eq(False)]

# Put the bad (_x) and non-bad (_y) counts of each group side by side.
df4_jg1 = df4_lp.merge(df4_flp, on='人数分类')
df4_jg1 = df4_jg1[['人数分类', '电影名称_x', '电影名称_y']]
df4_jg1['电影总数'] = df4_jg1['电影名称_x'] + df4_jg1['电影名称_y']
df4_jg1['烂片占比'] = df4_jg1['电影名称_x'] / df4_jg1['电影总数']
df4_jg1 = df4_jg1.drop('电影名称_y', axis=1)
df4_jg1.columns = ['人数分类', '烂片数量', '电影总数', '烂片占比']

国产烂片深度揭秘
4.2、查看烂片占比最高的演员top20及个别演员的影片情况

# Bad films with a known cast (not referenced again in the visible part of
# this script).
df4_zylp = df1[(df1['豆瓣评分']<4.3)&(df1['主演'].notnull())]
# All films with a known cast.
df4_dyzs = df1[df1['主演'].notnull()]

# Cast strings with spaces removed. Names are both extracted from and matched
# against this cleaned form; matching the cleaned names against the raw
# column would miss any name that contains a space.
zy_clean = df4_dyzs['主演'].str.replace(' ','')
zy = zy_clean.str.split('/')
zy_list = []
for i in zy:
    zy_list.extend(i)
zy_list = list(set(zy_list))


# For every actor appearing in more than 2 films: film count and bad-film share.
df4_jgdict = []
for i in zy_list:
    if not i:
        # An empty name (e.g. from a trailing '/') is a substring of every
        # cast string and would be counted as an actor in all films.
        continue
    # regex=False: names are plain text, not patterns.
    j = df4_dyzs[zy_clean.str.contains(i, regex=False)]
    if len(j) > 2:
        df4_dict = {}
        zy_dyzl = len(j)
        zy_lpzl = len(j[j['豆瓣评分']<4.3])
        zy_lpzb = zy_lpzl/zy_dyzl
        df4_dict['zy'] = i
        df4_dict['zy_dyzl'] = zy_dyzl
        df4_dict['zy_lpzb'] = zy_lpzb
        df4_jgdict.append(df4_dict)

# The 20 actors with the highest bad-film share.
df4_jg = pd.DataFrame(df4_jgdict)
df4_jg = df4_jg.sort_values('zy_lpzb',ascending = False)[:20]
# Show the films of one specific actor.
print(df4_dyzs[df4_dyzs['主演'].str.contains('吴亦凡')])

吴亦凡的影片情况
国产烂片深度揭秘

5.1、通过导演字段筛选不同导演的影片,选取07-17年的影片,剔除影片不超过10部的导演

# NOTE(review): df5 is not defined in the visible part of this file. The code
# below needs it to provide '导演' and '豆瓣评分' (and later 'year' and
# '电影名称'); presumably it is the 2007-2017 subset of df1 - confirm.

# Distinct director names from the '/'-separated '导演' field. Missing cells
# are dropped first: after .str.split they stay NaN, and list.extend(NaN)
# raises TypeError.
df5_dylist = []
df5_dy = df5['导演'].dropna().str.replace(' ','').str.split('/')
for i in df5_dy:
    df5_dylist.extend(i)
df5_dymc = list(set(df5_dylist))


# Keep films that have BOTH a rating and a director. The second filter is
# applied to df5_re; filtering df5 again would overwrite the first result and
# silently drop the rating filter.
df5_re = df5[df5['豆瓣评分'].notnull()]
df5_re = df5_re[df5_re['导演'].notnull()]

# For every director with more than 10 films: film count and bad-film share.
df5_list = []
for i in df5_dymc:
    if not i:
        # An empty name is a substring of every director string.
        continue
    # Select the director's films once and reuse the result; regex=False
    # keeps names with special characters from being read as patterns.
    df5_dyzl = df5_re[df5_re['导演'].str.contains(i, regex=False)]
    df5_zdzl = len(df5_dyzl)
    if df5_zdzl > 10:
        df5_lpzl = df5_dyzl[df5_dyzl['豆瓣评分']<4.3]
        df5_lpbl = len(df5_lpzl)/df5_zdzl
        df5_dict = {}
        df5_dict['dymc'] = i
        df5_dict['lpbl'] = df5_lpbl
        df5_dict['dyzl'] = df5_zdzl
        df5_list.append(df5_dict)
df5_jg1 = pd.DataFrame(df5_list)


def f5(data,i):
    """Return one director's yearly mean rating and film count.

    Parameters
    ----------
    data : DataFrame
        Must provide the columns '导演' (string), 'year', '电影名称' and
        '豆瓣评分' (numeric).
    i : str
        Director name, matched literally as a substring of '导演'.

    Returns
    -------
    DataFrame
        Indexed by 'year' with the columns
        'dbpf' - mean '豆瓣评分' of that year's films,
        'dysl' - number of films with a non-null '电影名称' in that year,
        'size' - 'dysl' * 5.
    """
    df5_i = data[data['导演'].str.contains(i, regex=False)]
    by_year = df5_i.groupby('year')
    # Average only the rating column. Averaging the whole frame would also
    # try to average the text columns (an error on recent pandas) and would
    # make the result's column count depend on how many numeric columns
    # `data` happens to have.
    df5_jg2 = pd.DataFrame({
        'dbpf': by_year['豆瓣评分'].mean(),
        'dysl': by_year['电影名称'].count(),
    })
    df5_jg2['size'] = df5_jg2['dysl']*5

    return(df5_jg2)
# Yearly rating/output tables for the four directors plotted below.
df5_wj = f5(df5_re,'王晶')
df5_zw = f5(df5_re,'周伟')
df5_xk = f5(df5_re,'徐克')
# NOTE(review): the variable suffix 'lxc' does not match the director name
# passed here; the plotting code relies on this variable name.
df5_lxc = f5(df5_re,'邓衍成')

5.2、绘制散点图查看不同年份评分情况

from bokeh.models.annotations import BoxAnnotation


# Hover tooltip: mean rating and number of films of the hovered year.
hover = HoverTool(tooltips=[
    ("该年电影均分", "@dbpf"),
    ("该年电影产量", "@dysl"),
])
p5 = figure(
    title="不同导演每年的电影产量及电影均分",
    plot_width=900,
    plot_height=500,
    tools=[hover, 'reset,xwheel_zoom,pan,crosshair,box_select'],
)


def _draw_director(src, label, color):
    """Add one director's yearly bubbles (x=year, y=mean rating) to p5."""
    p5.circle(
        x='year',
        y='dbpf',
        source=src,
        size='size',
        legend=label,
        fill_color=color,
        fill_alpha=0.7,
        line_color=None,
    )


# One glyph series per director, added in the same order as before.
source1 = ColumnDataSource(df5_wj)
_draw_director(source1, "王晶", 'olive')

source2 = ColumnDataSource(df5_zw)
_draw_director(source2, "周伟", 'blue')

source3 = ColumnDataSource(df5_xk)
_draw_director(source3, "徐克", 'green')

source4 = ColumnDataSource(df5_lxc)
_draw_director(source4, "邓衍成", 'gray')

# Translucent red box whose upper edge sits at a rating of 4.4.
bg = BoxAnnotation(top=4.4, fill_alpha=0.1, fill_color='red')
p5.add_layout(bg)


# Dashed grid lines on both axes; legend in the top-right corner.
for grid in (p5.xgrid, p5.ygrid):
    grid.grid_line_dash = [10, 4]
p5.legend.location = "top_right"

show(p5)

print('finished')

国产烂片深度揭秘