Pandas crosstab()与pivot_table()和groupby()的比较

发布时间:2021-12-03 公开文章
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
plt.rcParams['font.sans-serif']=['SimHei'] # 设置字体为黑体,解决Matplotlib中文乱码问题
plt.rcParams['axes.unicode_minus']=False # 解决Matplotlib坐标轴负号'-'显示为方块的问题
%matplotlib inline
# 忽略警告
import warnings
warnings.filterwarnings('ignore')

crosstab()与pivot_table()和groupby()的比较

#从外部导入数据
diamonds = pd.read_csv('./data/diamonds.csv')
diamonds.head()

diamonds.info()
diamonds.describe()

pd.crosstab(index=diamonds['cut'], columns=diamonds['color'])  # 对color进行聚类count

pd.crosstab(index=diamonds['cut'],
            columns=diamonds['color'],
            values=diamonds['price'],
            aggfunc=np.mean).round(0)  # 对价格进行聚类,并求price均值

使用 groupby()

diamonds.groupby(['cut', 'color'])['price'].mean().round(0)
 

pivot_table()

diamonds.pivot_table(values='price',
                     index='cut',
                     columns='color',
                     aggfunc=np.mean).round(0)

crosstab()

pd.crosstab(index=diamonds['cut'],
            columns=diamonds['color'],
            values=diamonds['price'],
            aggfunc=np.mean).round(0)

# crosstab()的另外两个有用参数是margins和margins_name(两者都存在于pivot_table()中)。设置为True时,边界计算每行和每列的和。
pd.crosstab(index=diamonds['cut'], 
            columns=diamonds['clarity'],  
            margins=True)

pd.crosstab(index=diamonds['cut'],
            columns=diamonds['clarity'],
            margins=True,
            normalize=True)  # normalize 标准化

pd.crosstab(index=[diamonds['cut'], diamonds['clarity']],
            columns=diamonds['color'])  # 复合序列

unstack() 与stack()

df1 = diamonds.groupby(['cut', 'color'])['price'].mean().round(0)
df1

set_index和reset_index

df1
cut        color
Fair       D        4291.0
           E        3682.0
           F        3827.0
           G        4239.0
           H        5136.0
           I        4685.0
           J        4976.0
Good       D        3405.0
           E        3424.0
           F        3496.0
           G        4123.0
           H        4276.0
           I        5079.0
           J        4574.0
Ideal      D        2629.0
           E        2598.0
           F        3375.0
           G        3721.0
           H        3889.0
           I        4452.0
           J        4918.0
Premium    D        3631.0
           E        3539.0
           F        4325.0
           G        4501.0
           H        5217.0
           I        5946.0
           J        6295.0
Very Good  D        3470.0
           E        3215.0
           F        3779.0
           G        3873.0
           H        4535.0
           I        5256.0
           J        5104.0
Name: price, dtype: float64
df1.reset_index()
df1.reset_index().set_index('color')

df1.reset_index().set_index('color').reset_index()

map、apply、applymap

diamonds.head()

trans = dict(zip(diamonds['cut'].unique().tolist(),['理想','优质','良好','非常好','一般' ]))
diamonds['cut_trans'] = diamonds['cut'].map(trans)
diamonds.head()

def trans2(x):
    return trans[x]
diamonds['cut_trans_apply'] = diamonds['cut'].apply(trans2)  # apply 函数,可以带参数,也可以指定, axis=0
diamonds.head()

# applymap的用法比较简单,会对DataFrame中的每个单元格执行指定函数的操作
df2.applymap(lambda x:"%.1f" % x) # 含有字符串会报错,保留1位小数
df2

loc和iloc

df2.loc['Fair']
color
D    4291.0
E    3682.0
F    3827.0
G    4239.0
H    5136.0
I    4685.0
J    4976.0
Name: Fair, dtype: float64
df2.iloc[0]
color
D    4291.0
E    3682.0
F    3827.0
G    4239.0
H    5136.0
I    4685.0
J    4976.0
Name: Fair, dtype: float64
df2.loc[:,['E']]

df2.iloc[:,[1]]

df2.loc[['Fair','Ideal'],['D','E']]

df2.loc[df2['D']==4291] # 取某一行,取多行匹配用到isin函数