人工智能学习笔记——数据分析处理库Pandas

import pandas

课程所用数据包

读取csv数据:
food_info = pandas.read_csv("food_info.csv")
print(type(food_info))

print(food_info.dtypes)#字符串为object结构

<class 'pandas.core.frame.DataFrame'>
NDB_No               int64
Shrt_Desc           object
Water_(g)          float64
Energ_Kcal           int64
Protein_(g)        float64
Lipid_Tot_(g)      float64
Ash_(g)            float64
Carbohydrt_(g)     float64
Fiber_TD_(g)       float64
Sugar_Tot_(g)      float64
Calcium_(mg)       float64
Iron_(mg)          float64
Magnesium_(mg)     float64
Phosphorus_(mg)    float64
Potassium_(mg)     float64
Sodium_(mg)        float64
Zinc_(mg)          float64
Copper_(mg)        float64
Manganese_(mg)     float64
Selenium_(mcg)     float64
Vit_C_(mg)         float64
Thiamin_(mg)       float64
Riboflavin_(mg)    float64
Niacin_(mg)        float64
Vit_B6_(mg)        float64
Vit_B12_(mcg)      float64
Vit_A_IU           float64
Vit_A_RAE          float64
Vit_E_(mg)         float64
Vit_D_mcg          float64
Vit_D_IU           float64
Vit_K_(mcg)        float64
FA_Sat_(g)         float64
FA_Mono_(g)        float64
FA_Poly_(g)        float64
Cholestrl_(mg)     float64
dtype: object

显示数据内容:
food_info.head()
人工智能学习笔记——数据分析处理库Pandas

显示前n行数据:
food_info.head(n)
显示后n行:
food_info.tail(n)
显示列名:
food_info.columns

Index(['NDB_No', 'Shrt_Desc', 'Water_(g)', 'Energ_Kcal', 'Protein_(g)',
       'Lipid_Tot_(g)', 'Ash_(g)', 'Carbohydrt_(g)', 'Fiber_TD_(g)',
       'Sugar_Tot_(g)', 'Calcium_(mg)', 'Iron_(mg)', 'Magnesium_(mg)',
       'Phosphorus_(mg)', 'Potassium_(mg)', 'Sodium_(mg)', 'Zinc_(mg)',
       'Copper_(mg)', 'Manganese_(mg)', 'Selenium_(mcg)', 'Vit_C_(mg)',
       'Thiamin_(mg)', 'Riboflavin_(mg)', 'Niacin_(mg)', 'Vit_B6_(mg)',
       'Vit_B12_(mcg)', 'Vit_A_IU', 'Vit_A_RAE', 'Vit_E_(mg)', 'Vit_D_mcg',
       'Vit_D_IU', 'Vit_K_(mcg)', 'FA_Sat_(g)', 'FA_Mono_(g)', 'FA_Poly_(g)',
       'Cholestrl_(mg)'],
      dtype='object')

显示行列数:
food_info.shape

(8618, 36)
取数据:
按索引取数据:
food_info.loc[0]
NDB_No                         1001
Shrt_Desc          BUTTER WITH SALT
Water_(g)                     15.87
Energ_Kcal                      717
Protein_(g)                    0.85
Lipid_Tot_(g)                 81.11
Ash_(g)                        2.11
Carbohydrt_(g)                 0.06
Fiber_TD_(g)                      0
Sugar_Tot_(g)                  0.06
Calcium_(mg)                     24
Iron_(mg)                      0.02
Magnesium_(mg)                    2
Phosphorus_(mg)                  24
Potassium_(mg)                   24
Sodium_(mg)                     643
Zinc_(mg)                      0.09
Copper_(mg)                       0
Manganese_(mg)                    0
Selenium_(mcg)                    1
Vit_C_(mg)                        0
Thiamin_(mg)                  0.005
Riboflavin_(mg)               0.034
Niacin_(mg)                   0.042
Vit_B6_(mg)                   0.003
Vit_B12_(mcg)                  0.17
Vit_A_IU                       2499
Vit_A_RAE                       684
Vit_E_(mg)                     2.32
Vit_D_mcg                       1.5
Vit_D_IU                         60
Vit_K_(mcg)                       7
FA_Sat_(g)                   51.368
FA_Mono_(g)                  21.021
FA_Poly_(g)                   3.043
Cholestrl_(mg)                  215
Name: 0, dtype: object
切片:

food_info.loc[3:6]

定位具体值:
food_info.loc[3,"NDB_No"]

按照列名:
food_info["NDB_No"]
人工智能学习笔记——数据分析处理库Pandas
按照列名取多个数据:
columns = ["Zinc_(mg)","Copper_(mg)"]
food_info[columns]
人工智能学习笔记——数据分析处理库Pandas

取出以g结尾的数据:
col_names = food_info.columns.tolist()
print(col_names)
gram_columns = []
for c in col_names:
    if c.endswith("(g)"):
        gram_columns.append(c)
gram_df = food_info[gram_columns]
print(gram_df.head(3))

['NDB_No', 'Shrt_Desc', 'Water_(g)', 'Energ_Kcal', 'Protein_(g)', 'Lipid_Tot_(g)', 'Ash_(g)', 'Carbohydrt_(g)', 'Fiber_TD_(g)', 'Sugar_Tot_(g)', 'Calcium_(mg)', 'Iron_(mg)', 'Magnesium_(mg)', 'Phosphorus_(mg)', 'Potassium_(mg)', 'Sodium_(mg)', 'Zinc_(mg)', 'Copper_(mg)', 'Manganese_(mg)', 'Selenium_(mcg)', 'Vit_C_(mg)', 'Thiamin_(mg)', 'Riboflavin_(mg)', 'Niacin_(mg)', 'Vit_B6_(mg)', 'Vit_B12_(mcg)', 'Vit_A_IU', 'Vit_A_RAE', 'Vit_E_(mg)', 'Vit_D_mcg', 'Vit_D_IU', 'Vit_K_(mcg)', 'FA_Sat_(g)', 'FA_Mono_(g)', 'FA_Poly_(g)', 'Cholestrl_(mg)']
   Water_(g)  Protein_(g)  Lipid_Tot_(g)  Ash_(g)  Carbohydrt_(g)  \
0      15.87         0.85          81.11     2.11            0.06   
1      15.87         0.85          81.11     2.11            0.06   
2       0.24         0.28          99.48     0.00            0.00   

   Fiber_TD_(g)  Sugar_Tot_(g)  FA_Sat_(g)  FA_Mono_(g)  FA_Poly_(g)  
0           0.0           0.06      51.368       21.021        3.043  
1           0.0           0.06      50.489       23.426        3.012  
2           0.0           0.00      61.924       28.732        3.694  
组合运算:
result = food_info["Water_(g)"]*food_info["Energ_Kcal"]/1000
print(result.head(3))
0    11.37879
1    11.37879
2     0.21024
dtype: float64
添加数据:
food_info["Iron_(g)"] = result
print(food_info.shape)
(8618, 37)
排序:
food_info.sort_values("Sodium_(mg)",inplace=True,ascending=True)#ascending=True从小到大ascending=False从大到小;inplacenge=True不生成新数据
print(food_info["Sodium_(mg)"].head(100))
760     0.0
6470    0.0
654     0.0
655     0.0
2269    0.0
610     0.0
634     0.0
637     0.0
434     0.0
673     0.0
638     0.0
632     0.0
606     0.0
633     0.0
6463    0.0
630     0.0
判断数据是否非空:
import pandas as pd
import numpy as np
titanic_survival = pd.read_csv("titanic_train.csv")
age = titanic_survival["Age"]
#print(age.loc[0:10])
age_is_null = pd.isnull(age)#NaN返回True
# print (age_is_null)
age_null_true = age[age_is_null]
# print (age_null_true)
age_null_count = len(age_null_true)
print(age_null_count)

求均值:
correct_mean_age = titanic_survival["Age"].mean()
print (correct_mean_age)

29.6h9911764705882

条件操作:
不同等级客舱获救平均人数:

passenger_survival = titanic_survival.pivot_table(index="Pclass", values="Survived", aggfunc=np.mean)
#index="Pclass"按照客舱等级,values="Survived"对获救人数进行操作, aggfunc=np.mean进行求平均
print (passenger_survival)

        Survived
Pclass          
1       0.629630
2       0.472826
3       0.242363
不同上船地点的船票总费用和获救总人数:
port_stats = titanic_survival.pivot_table(index="Embarked", values=["Fare","Survived"], aggfunc=np.sum)
print(port_stats)
                Fare  Survived
Embarked                      
C         10072.2962        93
Q          1022.2543        30
S         17439.3988       217
丢弃缺失值:
new_titanic_survival = titanic_survival.dropna(axis=0,subset=["Age", "Sex"])

重新排序:
new_titanic_survival = titanic_survival.sort_values("Age",ascending=False)
print (new_titanic_survival[0:5])
titanic_reindexed = new_titanic_survival.reset_index(drop=True)
print('-------------')
print(titanic_reindexed.loc[0:5])

     PassengerId  Survived  Pclass                                  Name  \
630          631         1       1  Barkworth, Mr. Algernon Henry Wilson   
851          852         0       3                   Svensson, Mr. Johan   
493          494         0       1               Artagaveytia, Mr. Ramon   
96            97         0       1             Goldschmidt, Mr. George B   
116          117         0       3                  Connors, Mr. Patrick   

      Sex   Age  SibSp  Parch    Ticket     Fare Cabin Embarked  
630  male  80.0      0      0     27042  30.0000   A23        S  
851  male  74.0      0      0    347060   7.7750   NaN        S  
493  male  71.0      0      0  PC 17609  49.5042   NaN        C  
96   male  71.0      0      0  PC 17754  34.6542    A5        C  
116  male  70.5      0      0    370369   7.7500   NaN        Q  
-------------
   PassengerId  Survived  Pclass                                  Name   Sex  \
0          631         1       1  Barkworth, Mr. Algernon Henry Wilson  male   
1          852         0       3                   Svensson, Mr. Johan  male   
2          494         0       1               Artagaveytia, Mr. Ramon  male   
3           97         0       1             Goldschmidt, Mr. George B  male   
4          117         0       3                  Connors, Mr. Patrick  male   
5          673         0       2           Mitchell, Mr. Henry Michael  male   

    Age  SibSp  Parch      Ticket     Fare Cabin Embarked  
0  80.0      0      0       27042  30.0000   A23        S  
1  74.0      0      0      347060   7.7750   NaN        S  
2  71.0      0      0    PC 17609  49.5042   NaN        C  
3  71.0      0      0    PC 17754  34.6542    A5        C  
4  70.5      0      0      370369   7.7500   NaN        Q  
5  70.0      0      0  C.A. 24580  10.5000   NaN        S  
使用自定义函数:

返回第一百行数据:
def hundredth_row(column):

    # Extract the hundredth item
    hundredth_item = column.loc[99]
    return hundredth_item
hundredth_row = titanic_survival.apply(hundredth_row)
print (hundredth_row)
PassengerId                  100
Survived                       0
Pclass                         2
Name           Kantor, Mr. Sinai
Sex                         male
Age                           34
SibSp                          1
Parch                          0
Ticket                    244367
Fare                          26
Cabin                        NaN
Embarked                       S
dtype: object
返回每一列的空数据个数:
def not_null_count(column):
    column_null = pd.isnull(column)
    null = column[column_null]
    return len(null)
column_null_count = titanic_survival.apply(not_null_count)
print (column_null_count)
PassengerId                  100
Survived                       0
Pclass                         2
Name           Kantor, Mr. Sinai
Sex                         male
Age                           34
SibSp                          1
Parch                          0
Ticket                    244367
Fare                          26
Cabin                        NaN
Embarked                       S
dtype: object

判断是否成年(离散化操作):
def is_minor(row):
    if row["Age"] < 18:
        return True
    else:

        return False


minors = titanic_survival.apply(is_minor, axis=1)

#print minors


def generate_age_label(row):
    age = row["Age"]
    if pd.isnull(age):
        return "unknown"
    elif age < 18:
        return "minor"
    else:
        return "adult"


age_labels = titanic_survival.apply(generate_age_label, axis=1)
print (age_labels)
0        adult
1        adult
2        adult
3        adult
4        adult
5      unknown
6        adult
7        minor
8        adult
9        minor
10       minor
11       adult
12       adult
13       adult
14       minor
15       adult
16       minor
17     unknown
18       adult
19     unknown
20       adult
21       adult
22       minor
23       adult
24       minor
25       adult
复合操作:
titanic_survival['age_labels'] = age_labels
age_group_survival = titanic_survival.pivot_table(index="age_labels", values="Survived")
print (age_group_survival)
age_labels
adult      0.381032
minor      0.539823
unknown    0.293785
Name: Survived, dtype: float64