# numerical calculation & data frames
import numpy as np
import pandas as pd

# visualization
import matplotlib.pyplot as plt
import seaborn as sns
import seaborn.objects as so
import plotly.express as px

# statistics
import statsmodels.api as sm
# Load the "tips" example dataset through seaborn and display it.
tips = sns.load_dataset("tips")
tips
total_bill tip sex smoker day time size
0 16.99 1.01 Female No Sun Dinner 2
1 10.34 1.66 Male No Sun Dinner 3
2 21.01 3.50 Male No Sun Dinner 3
3 23.68 3.31 Male No Sun Dinner 2
.. ... ... ... ... ... ... ...
240 27.18 2.00 Female Yes Sat Dinner 2
241 22.67 2.00 Male Yes Sat Dinner 2
242 17.82 1.75 Male No Sat Dinner 2
243 18.78 3.00 Female No Thur Dinner 2
[244 rows x 7 columns]
# Values of the DataFrame as an ndarray
tips.values  # or tips.to_numpy()
total_bill tip sex smoker day time size
0 16.99 1.01 Female No Sun Dinner 2
1 10.34 1.66 Male No Sun Dinner 3
2 21.01 3.50 Male No Sun Dinner 3
3 23.68 3.31 Male No Sun Dinner 2
4 24.59 3.61 Female No Sun Dinner 4
total_bill tip size
count 244.00 244.00 244.00
mean 19.79 3.00 2.57
std 8.90 1.38 0.95
min 3.07 1.00 1.00
25% 13.35 2.00 2.00
50% 17.80 2.90 2.00
75% 24.13 3.56 3.00
max 50.81 10.00 6.00
# Summary statistics for every column, not just the numeric ones.
tips.describe(include="all") # list columns of all types
total_bill tip sex smoker day time size
count 244.00 244.00 244 244 244 244 244.00
unique NaN NaN 2 2 4 2 NaN
top NaN NaN Male No Sat Dinner NaN
freq NaN NaN 157 151 87 176 NaN
... ... ... ... ... ... ... ...
25% 13.35 2.00 NaN NaN NaN NaN 2.00
50% 17.80 2.90 NaN NaN NaN NaN 2.00
75% 24.13 3.56 NaN NaN NaN NaN 3.00
max 50.81 10.00 NaN NaN NaN NaN 6.00
[11 rows x 7 columns]
# Summary (count, unique, top, freq) restricted to the category-dtype columns.
tips.describe(include="category")
sex smoker day time
count 244 244 244 244
unique 2 2 4 2
top Male No Sat Dinner
freq 157 151 87 176
# Frequency tables built with value_counts().
s1 = tips["day"].value_counts()  # select the "day" column, then count each category
s2 = tips["day"].value_counts(sort=False)  # default: sort=True
s3 = tips["day"].value_counts(normalize=True)  # proportion of each category
s4 = tips[["sex", "smoker"]].value_counts()  # select "sex" and "smoker", then count each unique combination
species island sex
count 344 344 333
unique 3 3 2
top Adelie Biscoe Male
freq 152 168 168
# Count the rows for each unique (island, species) combination.
# NOTE(review): `penguins` is not defined in the visible part of this file;
# presumably it is loaded with sns.load_dataset("penguins") -- confirm.
penguins[["island", "species"]].value_counts()
island species
Biscoe Gentoo 124
Dream Chinstrap 68
Adelie 56
Torgersen Adelie 52
Biscoe Adelie 44
dtype: int64
# Count each unique (sex, species) combination, keeping rows that contain NA.
penguins[["sex", "species"]].value_counts(dropna=False) # NA values are omitted by default
sex species
Female Adelie 73
Male Adelie 73
Gentoo 61
Female Gentoo 58
Chinstrap 34
Male Chinstrap 34
NaN Adelie 6
Gentoo 5
dtype: int64
penguins.isna().sum() # number of NA values in each column
species 0
island 0
bill_length_mm 2
bill_depth_mm 2
flipper_length_mm 2
body_mass_g 2
sex 11
dtype: int64
# Sort the rows by the "tip" column in descending order (largest tips first).
tips.sort_values("tip", ascending=False)
total_bill tip sex smoker day time size
170 50.81 10.00 Male Yes Sat Dinner 3
212 48.33 9.00 Male No Sat Dinner 4
23 39.42 7.58 Male No Sat Dinner 4
59 48.27 6.73 Male No Sat Dinner 4
.. ... ... ... ... ... ... ...
236 12.60 1.00 Male Yes Sat Dinner 2
111 7.25 1.00 Female No Sat Dinner 1
67 3.07 1.00 Female Yes Sat Dinner 1
92 5.75 1.00 Female Yes Fri Dinner 2
[244 rows x 7 columns]
total_bill tip sex smoker day time size
125 29.80 4.20 Female No Thur Lunch 6
143 27.05 5.00 Female No Thur Lunch 6
156 48.17 5.00 Male No Sun Dinner 6
141 34.30 6.70 Male No Thur Lunch 6
.. ... ... ... ... ... ... ...
67 3.07 1.00 Female Yes Sat Dinner 1
111 7.25 1.00 Female No Sat Dinner 1
82 10.07 1.83 Female No Thur Lunch 1
222 8.58 1.92 Male Yes Fri Lunch 1
[244 rows x 7 columns]