# DataFrame和Series练习
#
# 主要练习DataFrame的常用操作。

import pandas as pd
# Build a small 2x4 demo frame from nested row lists.
zhou = [
    [1, 2, 3, 4],
    [5, 6, 7, 8],
]
df = pd.DataFrame(data=zhou, columns=['x1', 'x2', 'x3', 'x4'])
# Take a copy of the DataFrame (deep=True is the pandas default for copy()).
df1 = df.copy(deep=True)

import os
import datetime
import numpy as np
from scipy import sparse
from scipy.stats import mstats
import pandas as pd
from pandas.api.types import is_numeric_dtype
import re
import pickle
import shutil

import xgboost as xgb
import lightgbm as lgb
from sklearn.model_selection import KFold

# Pull the first number out of each string and round it to two decimals.
# The pattern is a raw string so `\d` is not read as an (invalid) string
# escape, and the optional sign is attached to BOTH alternatives so that a
# signed integer such as '-5' keeps its sign instead of being read as '5'.
s = pd.Series(['-10.0121', '+11.222', '9.087'])
s = s.str.extract(r'([-+]?(?:\d*\.\d+|\d+))', expand=False).astype(float).round(2)
# Value of s at this point:
# 0   -10.01
# 1    11.22
# 2     9.09
# dtype: float64

# Deal with outliers: winsorize with a 1% limit on each tail.
s = pd.Series(mstats.winsorize(s, limits=[0.01, 0.01]))
    
    
# Toy training set: one feature column and one label column.
tmp_s = pd.DataFrame(
    [['row1', 'y1'], ['row2', 'y1'], ['row3', 'y2'], ['row4', 'y1'], ['row5', 'y1'],
     ['row6', 'y2'], ['row7', 'y1'], ['row8', 'y2'], ['row9', 'y3'], ['row10', 'y2']],
    columns=['feature', 'label'],
)
# Count how often each label occurs in the training data.
tmp_counts = tmp_s['label'].value_counts()
# Resulting counts:
# y1    5
# y2    4
# y3    1
num_top = 2
# Filter: keep the first `num_top` entries of the counts Series.
# .iloc makes the positional slice explicit on this label-indexed Series.
tmp_counts_top = tmp_counts.iloc[:num_top]
# Number of labels whose count is above the threshold (here: > 4).
tmp_num_unique = np.sum(tmp_counts > 4)
# The labels ('y1', 'y2') are not numeric strings, so coercion gives NaN.
tmp_counts_num = pd.to_numeric(tmp_counts_top.index.to_series(), errors='coerce')
# Chained calls: extract the number embedded in each label, cast, round.
# Raw-string pattern; the optional sign applies to both alternatives so a
# signed integer keeps its sign.
tmp_counts_num_extract = (
    tmp_counts_top.index.to_series()
    .str.extract(r'([-+]?(?:\d*\.\d+|\d+))', expand=False)
    .astype(float)
    .round(2)
)
# y1    1.0
# y2    2.0

import numpy as np    
# Demo frame: three rows, five string-named columns, one missing cell
# (last row, column '0981').
df = pd.DataFrame(
    [[1, 2, 3, 4, 5], [6, 7, 8, 9, 10], [11, 12, np.nan, 14, 15]],
    columns=['0120', '0125', '0981', '0984', '0983'],
)
# Per-row count of NON-null cells (notnull() flags the values that are present).
df[['0120', '0125', '0981', '0984', '0983']].notnull().sum(axis=1)
# 0    5
# 1    5
# 2    4
# Count of non-null cells in a single column.
df['0981'].notnull().sum()
# Name of one column (a Series).
df['0981'].name


# Keyword vocabulary (Chinese terms).
keywords_ch = [
    '糖尿病', '高血压', '血脂', '治疗中', '肥胖',
    '血糖', '血压高', '血脂偏高', '血压高偏高', '冠心病',
    '脂肪肝', '不齐', '过缓', '血管弹性', '脂',
    '硬化', '舒张期杂音', '收缩期杂音', '低盐', '低脂',
]
# Parallel list of ASCII names: the keyword at position k is named 'disease_k'.
keywords_en = ['disease_' + str(position) for position, _ in enumerate(keywords_ch)]


# Three ways to apply a function with pandas:
#   1. DataFrame.apply()    - called once per row or once per column
#   2. DataFrame.applymap() - called once per element of a DataFrame
#                             (named DataFrame.map in newer pandas)
#   3. Series.map()         - called once per element of a Series

# 1. apply()
frame = pd.DataFrame(np.random.randn(4, 3), columns=list('bde'), index=['Utah', 'Ohio', 'Texas', 'Oregon'])


def f(x):
    """Return the spread (max minus min) of a one-dimensional vector."""
    return x.max() - x.min()


# axis=1 passes each ROW to f (one result per index label);
# axis=0, the default, would pass each COLUMN instead.
frame.apply(f, axis=1)


# 2. applymap()
# NOTE: the name `format` shadows the builtin of the same name; it is kept
# unchanged so that any code relying on this module-level name still works.
def format(x):
    """Render a number as a string with two decimal places."""
    return '%.2f' % x


# Apply the function to every element of the DataFrame. Use DataFrame.map
# where it exists and fall back to the older applymap spelling otherwise.
_elementwise = frame.map if hasattr(pd.DataFrame, 'map') else frame.applymap
_elementwise(format)

# 3. map()
# Apply the function to every element of one Series.
frame['e'].map(format)

# Disabled example, kept as a bare string literal so it is never executed.
# It shows boolean results being turned into integer columns with .astype(int).
# NOTE(review): if this is ever re-enabled, `~` applied to integer columns is
# a bitwise NOT (giving -1/-2, not 0/1), and the stated intent ("third class =
# everything not in the first two") looks like it needs `|` rather than `&`
# -- confirm before use.
"""
df_to_merge['if_subhealth'] = (df['2302'] == '亚健康').astype(int)  #直接将bool值转为类别值(强制类型转换)
df_to_merge['if_ill'] = (df['2302'] == '疾病').astype(int)
df_to_merge['if_health'] = (~(df_to_merge['if_subhealth']&df_to_merge['if_ill'])).astype(int)  #只有三类,第三类=总体-一二类即可。

df_to_merge['if_jianman'] = df['1402'].apply(lambda x: '减慢' in str(x)).astype(int)  #对DataFrame的一列做转换:bool->数值。
df_to_merge['if_zengkuai'] = df['1402'].apply(lambda x: '增快' in str(x)).astype(int)
df_to_merge['if_jiangdi'] = df['1402'].apply(lambda x: '降低' in str(x)).astype(int)
"""
# Toy records: the first inner list holds the ids x1..x40, the other two hold
# one free-text value per id.
data11 = [
    ['x' + str(i) for i in range(1, 41)],
    ['检出糖尿病II级', '正常', '正常', '未检查', '健康', '无', '未见', '冠心病1',
     '高血压', '血脂', '脂肪肝阳性', '慢性胃炎阳性', '阑尾炎阳性', '甲肝', '肾结石',
     '胆囊切除', '甲肝', '冠心病', '胆结石', '甲状腺', '脑梗塞', '胆囊炎', '脑溢血',
     '早搏', '杂音', '心动过缓', '心律不齐', '心动过速', '正常', '正常', '未检查', '健康', '无', '未见',
     '正常', '正常', '未检查', '健康', '无', '未见'],
    ['检出糖尿病II级', '正常', '正常', '未检查', '健康', '无', '未见', '冠心病1',
     '高血压', '血脂', '正常', '未检查', '健康', '无', '未见',
     '胆囊切除', '甲肝', '冠心病阳性', '胆结石阳性', '甲状腺阳性', '脑梗塞阳性', '胆囊炎', '脑溢血',
     '早搏', '杂音', '查见心动过缓', '查见心律不齐', '心动过速', '正常', '正常', '未检查', '健康', '无', '未见',
     '正常', '正常', '未检查', '健康', '无', '未见'],
]

# Transpose so every id becomes a row with three columns.
s = pd.DataFrame(data11).T
s.columns = ['vid', '0434', '0409']
keywords_ch = ['糖尿病', '高血压', '血脂', '脂肪肝', '慢性胃炎', '阑尾炎', '甲肝', '肾结石',
               '胆囊切除', '甲肝', '冠心病', '胆结石', '甲状腺', '脑梗塞', '胆囊炎', '脑溢血',
               '早搏', '杂音', '心动过缓', '心律不齐', '心动过速']
keywords_en = ['disease_' + str(i) for i in range(len(keywords_ch))]
dict_out = {}
# Turn the two raw text columns into one 1/0/NaN indicator column per keyword.
for sname in s.columns:
    if sname not in ['0434', '0409']:
        continue
    text = s[sname]
    # Rows whose text contains a negative marker. This does not depend on the
    # keyword, so it is computed once per source column.
    is_negative = text.str.contains('无|未查见|健康|未见', na=False)
    # Distinct loop names: the inner loop no longer reuses (and clobbers) the
    # outer loop variable.
    for kw, keyword in zip(keywords_en, keywords_ch):
        s_out = pd.Series([np.nan] * len(s))
        # 1 where the text contains the keyword or any of the positive markers.
        s_out[text.str.contains('{}|阳性|查见|检到|检出'.format(keyword), na=False)] = 1
        # Negatives are written last so they win over a positive match
        # (for example '未查见' also contains '查见').
        s_out[is_negative] = 0
        if np.sum(s_out) > 2:  # too few positives: leave this keyword out
            # Keyed by keyword name only, so a later source column overwrites
            # an earlier one; key by sname + '_' + kw to keep both.
            dict_out[kw] = s_out
df_to_concat = pd.DataFrame(dict_out)

# Small frame: a 'vid' column followed by three feature columns.
X_test_df = pd.DataFrame(
    data=[[1, 2, 3, 4], [5, 6, 7, 8]],
    columns=['vid', 'x1', 'x2', 'x3'],
)
# Simplest way to get the feature part: drop the 'vid' column. The result is
# not assigned, so X_test_df keeps all four columns. (Selecting the wanted
# feature columns with a list of names works as well.)
X_test_df.drop(columns='vid')
#    x1  x2  x3
# 0   2   3   4
# 1   6   7   8

# Same selection as a plain NumPy array.
X_test_df.drop(columns='vid').values
# array([[2, 3, 4],
#        [6, 7, 8]], dtype=int64)

def log1p_mse(preds, train_data):
    """Return the mean squared error between log1p(preds) and log1p(labels).

    Args:
        preds: Predicted values (anything ``np.log1p`` accepts).
        train_data: Object exposing ``get_label()``, which supplies the
            true values.

    Returns:
        A 3-tuple ``('error', value, False)``.
        NOTE(review): the tuple shape looks like a LightGBM custom-metric
        result (name, value, is_higher_better) -- confirm against the caller.
    """
    log_gap = np.log1p(preds) - np.log1p(train_data.get_label())
    return 'error', np.mean(log_gap ** 2), False


# 生成四类特征:1:纯文本2:纯数值;3:文本+数值;4:categorical
# Base table keyed by 'vid'.
X_processed = pd.DataFrame(
    [['A0001', 1, 2, 3], ['A0002', 4, 5, 6], ['B0003', 7, 8, 9], ['B0004', 11, 12, 13]],
    columns=['vid', 't1', 't2', 't3'],
)

# First extra batch of features (f1..f3), left-joined on 'vid'.
X_to_merge1 = pd.DataFrame(
    [['A0001', 101, 102, 102], ['A0002', 104, 105, 106], ['B0003', 107, 108, 109], ['B0004', 111, 112, 113]],
    columns=['vid', 'f1', 'f2', 'f3'],
)
X_processed = pd.merge(X_processed, X_to_merge1, how='left', on='vid')

# Second batch (s1..s3).
X_to_merge2 = pd.DataFrame(
    [['A0001', 1, 1, 0], ['A0002', 0, 0, 1], ['B0003', 1, 0, 0], ['B0004', 0, 1, 0]],
    columns=['vid', 's1', 's2', 's3'],
)
X_processed = pd.merge(X_processed, X_to_merge2, how='left', on='vid')

# Third batch: the 'sex' column.
X_to_merge3 = pd.DataFrame(
    [['A0001', 'f'], ['A0002', 'm'], ['B0003', 'm'], ['B0004', 'f']],
    columns=['vid', 'sex'],
)
X_processed = pd.merge(X_processed, X_to_merge3, how='left', on='vid')

# Fourth batch: the 'address' column. The name X_to_merge3 is reused here, so
# after this point it refers to the address table, not the sex table.
X_to_merge3 = pd.DataFrame(
    [['A0001', 'Shanghai'], ['A0002', 'Beijing'], ['B0003', 'New York'], ['B0004', 'Tokyo']],
    columns=['vid', 'address'],
)
X_processed = pd.merge(X_processed, X_to_merge3, how='left', on='vid')


# DataFrame和Series练习