人工智能学习笔记——数据分析处理库Pandas
import pandas
读取csv数据:
food_info = pandas.read_csv("food_info.csv")
print(type(food_info))
print(food_info.dtypes)#字符串为object结构
<class 'pandas.core.frame.DataFrame'> NDB_No int64 Shrt_Desc object Water_(g) float64 Energ_Kcal int64 Protein_(g) float64 Lipid_Tot_(g) float64 Ash_(g) float64 Carbohydrt_(g) float64 Fiber_TD_(g) float64 Sugar_Tot_(g) float64 Calcium_(mg) float64 Iron_(mg) float64 Magnesium_(mg) float64 Phosphorus_(mg) float64 Potassium_(mg) float64 Sodium_(mg) float64 Zinc_(mg) float64 Copper_(mg) float64 Manganese_(mg) float64 Selenium_(mcg) float64 Vit_C_(mg) float64 Thiamin_(mg) float64 Riboflavin_(mg) float64 Niacin_(mg) float64 Vit_B6_(mg) float64 Vit_B12_(mcg) float64 Vit_A_IU float64 Vit_A_RAE float64 Vit_E_(mg) float64 Vit_D_mcg float64 Vit_D_IU float64 Vit_K_(mcg) float64 FA_Sat_(g) float64 FA_Mono_(g) float64 FA_Poly_(g) float64 Cholestrl_(mg) float64 dtype: object
显示数据内容:
food_info.head()
显示前n行数据:
food_info.head(n)
显示后n行:
food_info.tail(n)
显示列名:
food_info.columns
Index(['NDB_No', 'Shrt_Desc', 'Water_(g)', 'Energ_Kcal', 'Protein_(g)', 'Lipid_Tot_(g)', 'Ash_(g)', 'Carbohydrt_(g)', 'Fiber_TD_(g)', 'Sugar_Tot_(g)', 'Calcium_(mg)', 'Iron_(mg)', 'Magnesium_(mg)', 'Phosphorus_(mg)', 'Potassium_(mg)', 'Sodium_(mg)', 'Zinc_(mg)', 'Copper_(mg)', 'Manganese_(mg)', 'Selenium_(mcg)', 'Vit_C_(mg)', 'Thiamin_(mg)', 'Riboflavin_(mg)', 'Niacin_(mg)', 'Vit_B6_(mg)', 'Vit_B12_(mcg)', 'Vit_A_IU', 'Vit_A_RAE', 'Vit_E_(mg)', 'Vit_D_mcg', 'Vit_D_IU', 'Vit_K_(mcg)', 'FA_Sat_(g)', 'FA_Mono_(g)', 'FA_Poly_(g)', 'Cholestrl_(mg)'], dtype='object')
显示行列数:
food_info.shape
(8618, 36)取数据:
按索引取数据:
food_info.loc[0]
NDB_No 1001
Shrt_Desc BUTTER WITH SALT
Water_(g) 15.87
Energ_Kcal 717
Protein_(g) 0.85
Lipid_Tot_(g) 81.11
Ash_(g) 2.11
Carbohydrt_(g) 0.06
Fiber_TD_(g) 0
Sugar_Tot_(g) 0.06
Calcium_(mg) 24
Iron_(mg) 0.02
Magnesium_(mg) 2
Phosphorus_(mg) 24
Potassium_(mg) 24
Sodium_(mg) 643
Zinc_(mg) 0.09
Copper_(mg) 0
Manganese_(mg) 0
Selenium_(mcg) 1
Vit_C_(mg) 0
Thiamin_(mg) 0.005
Riboflavin_(mg) 0.034
Niacin_(mg) 0.042
Vit_B6_(mg) 0.003
Vit_B12_(mcg) 0.17
Vit_A_IU 2499
Vit_A_RAE 684
Vit_E_(mg) 2.32
Vit_D_mcg 1.5
Vit_D_IU 60
Vit_K_(mcg) 7
FA_Sat_(g) 51.368
FA_Mono_(g) 21.021
FA_Poly_(g) 3.043
Cholestrl_(mg) 215
Name: 0, dtype: object
切片:food_info.loc[3:6]
定位具体值:
food_info.loc[3,"NDB_No"]
food_info["NDB_No"]
按照列名取多个数据:
columns = ["Zinc_(mg)","Copper_(mg)"]
food_info[columns]
取出以g结尾的数据:
col_names = food_info.columns.tolist()
print(col_names)
gram_columns = []
for c in col_names:
if c.endswith("(g)"):
gram_columns.append(c)
gram_df = food_info[gram_columns]
print(gram_df.head(3))
['NDB_No', 'Shrt_Desc', 'Water_(g)', 'Energ_Kcal', 'Protein_(g)', 'Lipid_Tot_(g)', 'Ash_(g)', 'Carbohydrt_(g)', 'Fiber_TD_(g)', 'Sugar_Tot_(g)', 'Calcium_(mg)', 'Iron_(mg)', 'Magnesium_(mg)', 'Phosphorus_(mg)', 'Potassium_(mg)', 'Sodium_(mg)', 'Zinc_(mg)', 'Copper_(mg)', 'Manganese_(mg)', 'Selenium_(mcg)', 'Vit_C_(mg)', 'Thiamin_(mg)', 'Riboflavin_(mg)', 'Niacin_(mg)', 'Vit_B6_(mg)', 'Vit_B12_(mcg)', 'Vit_A_IU', 'Vit_A_RAE', 'Vit_E_(mg)', 'Vit_D_mcg', 'Vit_D_IU', 'Vit_K_(mcg)', 'FA_Sat_(g)', 'FA_Mono_(g)', 'FA_Poly_(g)', 'Cholestrl_(mg)'] Water_(g) Protein_(g) Lipid_Tot_(g) Ash_(g) Carbohydrt_(g) \ 0 15.87 0.85 81.11 2.11 0.06 1 15.87 0.85 81.11 2.11 0.06 2 0.24 0.28 99.48 0.00 0.00 Fiber_TD_(g) Sugar_Tot_(g) FA_Sat_(g) FA_Mono_(g) FA_Poly_(g) 0 0.0 0.06 51.368 21.021 3.043 1 0.0 0.06 50.489 23.426 3.012 2 0.0 0.00 61.924 28.732 3.694组合运算:
result = food_info["Water_(g)"]*food_info["Energ_Kcal"]/1000
print(result.head(3))
0 11.37879 1 11.37879 2 0.21024 dtype: float64添加数据:
food_info["Iron_(g)"] = result
print(food_info.shape)
(8618, 37)排序:
food_info.sort_values("Sodium_(mg)",inplace=True,ascending=True)#ascending=True从小到大ascending=False从大到小;inplacenge=True不生成新数据
print(food_info["Sodium_(mg)"].head(100))
760 0.0 6470 0.0 654 0.0 655 0.0 2269 0.0 610 0.0 634 0.0 637 0.0 434 0.0 673 0.0 638 0.0 632 0.0 606 0.0 633 0.0 6463 0.0 630 0.0判断数据是否非空:
import pandas as pd
import numpy as np
titanic_survival = pd.read_csv("titanic_train.csv")
age = titanic_survival["Age"]
#print(age.loc[0:10])
age_is_null = pd.isnull(age)#NaN返回True
# print (age_is_null)
age_null_true = age[age_is_null]
# print (age_null_true)
age_null_count = len(age_null_true)
print(age_null_count)
求均值:
correct_mean_age = titanic_survival["Age"].mean()
print (correct_mean_age)
29.6h9911764705882
条件操作:
不同等级客舱获救平均人数:
passenger_survival = titanic_survival.pivot_table(index="Pclass", values="Survived", aggfunc=np.mean)
#index="Pclass"按照客舱等级,values="Survived"对获救人数进行操作, aggfunc=np.mean进行求平均
print (passenger_survival)
Survived Pclass 1 0.629630 2 0.472826 3 0.242363不同上船地点的船票总费用和获救总人数:
port_stats = titanic_survival.pivot_table(index="Embarked", values=["Fare","Survived"], aggfunc=np.sum)
print(port_stats)
Fare Survived Embarked C 10072.2962 93 Q 1022.2543 30 S 17439.3988 217丢弃缺失值:
new_titanic_survival = titanic_survival.dropna(axis=0,subset=["Age", "Sex"])
重新排序:
new_titanic_survival = titanic_survival.sort_values("Age",ascending=False)
print (new_titanic_survival[0:5])
titanic_reindexed = new_titanic_survival.reset_index(drop=True)
print('-------------')
print(titanic_reindexed.loc[0:5])
PassengerId Survived Pclass Name \ 630 631 1 1 Barkworth, Mr. Algernon Henry Wilson 851 852 0 3 Svensson, Mr. Johan 493 494 0 1 Artagaveytia, Mr. Ramon 96 97 0 1 Goldschmidt, Mr. George B 116 117 0 3 Connors, Mr. Patrick Sex Age SibSp Parch Ticket Fare Cabin Embarked 630 male 80.0 0 0 27042 30.0000 A23 S 851 male 74.0 0 0 347060 7.7750 NaN S 493 male 71.0 0 0 PC 17609 49.5042 NaN C 96 male 71.0 0 0 PC 17754 34.6542 A5 C 116 male 70.5 0 0 370369 7.7500 NaN Q ------------- PassengerId Survived Pclass Name Sex \ 0 631 1 1 Barkworth, Mr. Algernon Henry Wilson male 1 852 0 3 Svensson, Mr. Johan male 2 494 0 1 Artagaveytia, Mr. Ramon male 3 97 0 1 Goldschmidt, Mr. George B male 4 117 0 3 Connors, Mr. Patrick male 5 673 0 2 Mitchell, Mr. Henry Michael male Age SibSp Parch Ticket Fare Cabin Embarked 0 80.0 0 0 27042 30.0000 A23 S 1 74.0 0 0 347060 7.7750 NaN S 2 71.0 0 0 PC 17609 49.5042 NaN C 3 71.0 0 0 PC 17754 34.6542 A5 C 4 70.5 0 0 370369 7.7500 NaN Q 5 70.0 0 0 C.A. 24580 10.5000 NaN S使用自定义函数:
返回第一百行数据:
def hundredth_row(column):
hundredth_item = column.loc[99]
return hundredth_item
hundredth_row = titanic_survival.apply(hundredth_row)
print (hundredth_row)
PassengerId 100 Survived 0 Pclass 2 Name Kantor, Mr. Sinai Sex male Age 34 SibSp 1 Parch 0 Ticket 244367 Fare 26 Cabin NaN Embarked S dtype: object返回每一列的空数据个数:
def not_null_count(column):
column_null = pd.isnull(column)
null = column[column_null]
return len(null)
column_null_count = titanic_survival.apply(not_null_count)
print (column_null_count)
PassengerId 100 Survived 0 Pclass 2 Name Kantor, Mr. Sinai Sex male Age 34 SibSp 1 Parch 0 Ticket 244367 Fare 26 Cabin NaN Embarked S dtype: object
判断是否成年(离散化操作):
def is_minor(row):
if row["Age"] < 18:
return True
else:
return False
#print minors
age = row["Age"]
if pd.isnull(age):
return "unknown"
elif age < 18:
return "minor"
else:
return "adult"
age_labels = titanic_survival.apply(generate_age_label, axis=1)
print (age_labels)
0 adult 1 adult 2 adult 3 adult 4 adult 5 unknown 6 adult 7 minor 8 adult 9 minor 10 minor 11 adult 12 adult 13 adult 14 minor 15 adult 16 minor 17 unknown 18 adult 19 unknown 20 adult 21 adult 22 minor 23 adult 24 minor 25 adult复合操作:
titanic_survival['age_labels'] = age_labels
age_group_survival = titanic_survival.pivot_table(index="age_labels", values="Survived")
print (age_group_survival)
age_labels adult 0.381032 minor 0.539823 unknown 0.293785 Name: Survived, dtype: float64