Load Packages
# numerical calculation & data frames
import numpy as np
import pandas as pd
# visualization
import matplotlib.pyplot as plt
import seaborn as sns
import seaborn.objects as so
# statistics
import statsmodels.api as sm
R for Data Science by Wickham & Grolemund
# numerical calculation & data frames
import numpy as np
import pandas as pd
# visualization
import matplotlib.pyplot as plt
import seaborn as sns
import seaborn.objects as so
# statistics
import statsmodels.api as sm
# pandas options
# Enable copy-on-write semantics.
pd.set_option("mode.copy_on_write", True)
# Display floats with two decimals and show at most 7 rows per frame.
pd.options.display.precision = 2
pd.options.display.float_format = "{:.2f}".format  # revert with pd.reset_option('display.float_format')
pd.options.display.max_rows = 7

# NumPy options: two decimals, no scientific notation for small values.
np.set_printoptions(precision=2, suppress=True)
# Load a dataset
tips = sns.load_dataset("tips")
tips
total_bill tip sex smoker day time size
0 16.99 1.01 Female No Sun Dinner 2
1 10.34 1.66 Male No Sun Dinner 3
2 21.01 3.50 Male No Sun Dinner 3
.. ... ... ... ... ... ... ...
241 22.67 2.00 Male Yes Sat Dinner 2
242 17.82 1.75 Male No Sat Dinner 2
243 18.78 3.00 Female No Thur Dinner 2
[244 rows x 7 columns]
pd.crosstab()
두 카테고리 변수의 모든 level 쌍에 대한 count.
normalize: 비율을 계산
margins: 행과 열로 합산
groupby()를 적용해 구할 수 있으나 좀 더 간결.
# Count of rows for every (day, time) pair.
pd.crosstab(tips["day"], tips["time"])
time Lunch Dinner
day
Thur 61 1
Fri 7 12
Sat 0 87
Sun 0 76
# Equivalent using groupby.
# observed=False is explicit so unobserved category pairs (e.g. Sun/Lunch)
# are kept with a count of 0, as in the output shown below.
tips.groupby(["day", "time"], observed=False).size()
day time
Thur Lunch 61
Dinner 1
Fri Lunch 7
..
Sat Dinner 87
Sun Lunch 0
Dinner 76
Length: 8, dtype: int64
# Equivalent using groupby and unstack.
# observed=False is explicit so unobserved pairs appear as 0 counts,
# as in the output shown below.
tips.groupby(["day", "time"], observed=False).size().unstack()
time Lunch Dinner
day
Thur 61 1
Fri 7 12
Sat 0 87
Sun 0 76
Parameters
normalize: 비율을 계산 (index, columns, all)
margins: 행과 열로 합산
pd.crosstab(tips["day"], tips["time"], normalize=True, margins=True)
time Lunch Dinner All
day
Thur 0.25 0.00 0.25
Fri 0.03 0.05 0.08
Sat 0.00 0.36 0.36
Sun 0.00 0.31 0.31
All 0.28 0.72 1.00
pivot_table()
count 대신 mean
mean 외에 다른 aggregation 함수를 지정할 수 있음
grouping을 할 변수들 (index and/or columns)과 aggregate할 변수 지정 (values)
margins: 행과 열에 따라 합산
# grouping을 할 변수들 (index)과 aggregate할 변수 지정
tips.pivot_table(index=["day", "time"], values="tip")  # dropna=True
tip
day time
Thur Lunch 2.77
Dinner 3.00
Fri Lunch 2.38
Dinner 2.94
Sat Dinner 2.99
Sun Dinner 3.26
# Equivalent using groupby.
# observed=False is explicit so unobserved pairs (e.g. Sun/Lunch) are kept
# as NaN, as in the output shown below.
tips.groupby(["day", "time"], observed=False)["tip"].mean()
day time
Thur Lunch 2.77
Dinner 3.00
Fri Lunch 2.38
...
Sat Dinner 2.99
Sun Lunch NaN
Dinner 3.26
Name: tip, Length: 8, dtype: float64
# Specify the grouping variables (index & columns) and the variable to aggregate.
tips.pivot_table(index="day", columns="time", values="tip")
time Lunch Dinner
day
Thur 2.77 3.00
Fri 2.38 2.94
Sat NaN 2.99
Sun NaN 3.26
# Equivalent using groupby and unstack.
# observed=False states the grouping of unobserved category pairs explicitly.
tips.groupby(["day", "time"], observed=False)["tip"].mean().unstack()
time Lunch Dinner
day
Thur 2.77 3.00
Fri 2.38 2.94
Sat NaN 2.99
Sun NaN 3.26
# Aggregate two or more variables at once.
tips.pivot_table(index="day", columns="time", values=["tip", "total_bill"])
tip total_bill
time Lunch Dinner Lunch Dinner
day
Thur 2.77 3.00 17.66 18.78
Fri 2.38 2.94 12.85 19.66
Sat NaN 2.99 NaN 20.44
Sun NaN 3.26 NaN 21.41
# Equivalent using groupby and unstack.
# observed=False states the grouping of unobserved category pairs explicitly.
tips.groupby(["day", "time"], observed=False)[["tip", "total_bill"]].mean().unstack()
tip total_bill
time Lunch Dinner Lunch Dinner
day
Thur 2.77 3.00 17.66 18.78
Fri 2.38 2.94 12.85 19.66
Sat NaN 2.99 NaN 20.44
Sun NaN 3.26 NaN 21.41
# Add margins: an extra "All" row and column, as in the output below.
tips.pivot_table(index="day", columns="time", values="tip", margins=True)
time Lunch Dinner All
day
Thur 2.77 3.00 2.77
Fri 2.38 2.94 2.73
Sat NaN 2.99 2.99
Sun NaN 3.26 3.26
All 2.73 3.10 3.00