Load Packages
# numerical calculation & data frames
import numpy as np
import pandas as pd
# visualization
import matplotlib.pyplot as plt
import seaborn as sns
import seaborn.objects as so
# statistics
import statsmodels.api as smR for Data Science by Wickham & Grolemund
# numerical calculation & data frames
import numpy as np
import pandas as pd
# visualization
import matplotlib.pyplot as plt
import seaborn as sns
import seaborn.objects as so
# statistics
import statsmodels.api as sm# pandas options
pd.set_option("mode.copy_on_write", True)
pd.options.display.precision = 2
pd.options.display.float_format = '{:.2f}'.format # pd.reset_option('display.float_format')
pd.options.display.max_rows = 7
# Numpy options
np.set_printoptions(precision = 2, suppress=True)# Load a datdset
tips = sns.load_dataset("tips")
tips total_bill tip sex smoker day time size
0 16.99 1.01 Female No Sun Dinner 2
1 10.34 1.66 Male No Sun Dinner 3
2 21.01 3.50 Male No Sun Dinner 3
.. ... ... ... ... ... ... ...
241 22.67 2.00 Male Yes Sat Dinner 2
242 17.82 1.75 Male No Sat Dinner 2
243 18.78 3.00 Female No Thur Dinner 2
[244 rows x 7 columns]
pd.crosstab()두 카테고리 변수의 모든 level 쌍에 대한 count.
normalize: 비율을 계산margins: 행과 열로 합groupby()를 적용해 구할 수 있으나 좀 더 간결.
pd.crosstab(tips["day"], tips["time"])time Lunch Dinner
day
Thur 61 1
Fri 7 12
Sat 0 87
Sun 0 76
# groupby를 이용
tips.groupby(["day", "time"]).size()day time
Thur Lunch 61
Dinner 1
Fri Lunch 7
..
Sat Dinner 87
Sun Lunch 0
Dinner 76
Length: 8, dtype: int64
# groupby & unstack를 이용
tips.groupby(["day", "time"]).size().unstack()time Lunch Dinner
day
Thur 61 1
Fri 7 12
Sat 0 87
Sun 0 76
Parameters
normalize: 비율을 계산 (index, columns, all)margins: 행과 열로 합산pd.crosstab(tips["day"], tips["time"], normalize=True, margins=True)time Lunch Dinner All
day
Thur 0.25 0.00 0.25
Fri 0.03 0.05 0.08
Sat 0.00 0.36 0.36
Sun 0.00 0.31 0.31
All 0.28 0.72 1.00
pivot_table()count 대신 mean
mean 외에 다른 aggregation 함수를 지정할 수 있음
grouping을 할 변수들 (index and/or columns)과 aggregate할 변수 지정 (values)
margins: 행과 열에 따라 합산# grouping을 할 변수들 (index)과 aggregate할 변수 지정
tips.pivot_table(index=["day", "time"], values="tip") # dropna=True tip
day time
Thur Lunch 2.77
Dinner 3.00
Fri Lunch 2.38
Dinner 2.94
Sat Dinner 2.99
Sun Dinner 3.26
# groupby를 이용
tips.groupby(["day", "time"])["tip"].mean()day time
Thur Lunch 2.77
Dinner 3.00
Fri Lunch 2.38
...
Sat Dinner 2.99
Sun Lunch NaN
Dinner 3.26
Name: tip, Length: 8, dtype: float64
# grouping을 할 변수들 (index & columns)과 aggregate할 변수 지정
tips.pivot_table(index="day", columns="time", values="tip")time Lunch Dinner
day
Thur 2.77 3.00
Fri 2.38 2.94
Sat NaN 2.99
Sun NaN 3.26
# groupby를 이용
tips.groupby(["day", "time"])["tip"].mean().unstack()time Lunch Dinner
day
Thur 2.77 3.00
Fri 2.38 2.94
Sat NaN 2.99
Sun NaN 3.26
# 두 개 이상의 변수에 대한 aggregation
tips.pivot_table(index="day", columns="time", values=["tip", "total_bill"]) tip total_bill
time Lunch Dinner Lunch Dinner
day
Thur 2.77 3.00 17.66 18.78
Fri 2.38 2.94 12.85 19.66
Sat NaN 2.99 NaN 20.44
Sun NaN 3.26 NaN 21.41
# groupby를 이용
tips.groupby(["day", "time"])[["tip", "total_bill"]].mean().unstack() tip total_bill
time Lunch Dinner Lunch Dinner
day
Thur 2.77 3.00 17.66 18.78
Fri 2.38 2.94 12.85 19.66
Sat NaN 2.99 NaN 20.44
Sun NaN 3.26 NaN 21.41
# margins 추가
tips.pivot_table(index="day", columns="time", values="tip", margins=True)time Lunch Dinner All
day
Thur 2.77 3.00 2.77
Fri 2.38 2.94 2.73
Sat NaN 2.99 2.99
Sun NaN 3.26 3.26
All 2.73 3.10 3.00