Discretize

R for Data Science by Wickham & Grolemund

Author

Sungkyun Cho

Published

February 11, 2023

Load Packages
# numerical calculation & data frames
import numpy as np
import pandas as pd

# visualization
import matplotlib.pyplot as plt
import seaborn as sns
import seaborn.objects as so

# statistics
import statsmodels.api as sm
Options
# pandas options
pd.set_option("mode.copy_on_write", True)
pd.options.display.precision = 2
pd.options.display.float_format = '{:.2f}'.format  # pd.reset_option('display.float_format')
pd.options.display.max_rows = 8

# Numpy options
np.set_printoptions(precision = 2, suppress=True)
# import a dataset
diamonds = sm.datasets.get_rdataset("diamonds", "ggplot2").data
diamonds2 = diamonds.copy()  # unmodified copy

Categorical type in pandas

  • 일종의 데이터 압축 기술이며, 적은 levels로만 이루어진 데이터의 경우 메모리 절약
  • 순서가 부여되어, sorting을 하거나 min/max 함수가 알파벳 순서가 아닌 부여된 순서를 적용
  • 다른 Python library들 중에는 이 순서를 활용하는 것이 있음; 통계적 분석이나 플랏을 그릴 때

McKinney’s/Categorical data
pandas/Categorical data

pd.Categorical(diamonds["cut"])  # defualt: alphabetical order
['Ideal', 'Premium', 'Good', 'Premium', 'Good', ..., 'Ideal', 'Good', 'Very Good', 'Premium', 'Ideal']
Length: 53940
Categories (5, object): ['Fair', 'Good', 'Ideal', 'Premium', 'Very Good']
diamonds2["cut"].astype("category")  # defualt: alphabetical order
0            Ideal
1          Premium
2             Good
3          Premium
           ...    
53936         Good
53937    Very Good
53938      Premium
53939        Ideal
Name: cut, Length: 53940, dtype: category
Categories (5, object): ['Fair', 'Good', 'Ideal', 'Premium', 'Very Good']

카테고리의 순서를 지정

diamonds["cut"] = pd.Categorical(
    diamonds["cut"], 
    categories=["Fair", "Good", "Very Good", "Premium", "Ideal"],
    ordered=True
)
# .astype() method를 쓰려면,
from pandas.api.types import CategoricalDtype

cat_type = CategoricalDtype(
    categories=["Fair", "Good", "Very Good", "Premium", "Ideal"], ordered=True
)
diamonds["cut"] = diamonds["cut"].astype(cat_type)
diamonds["cut"]
0            Ideal
1          Premium
2             Good
3          Premium
           ...    
53936         Good
53937    Very Good
53938      Premium
53939        Ideal
Name: cut, Length: 53940, dtype: category
Categories (5, object): ['Fair' < 'Good' < 'Very Good' < 'Premium' < 'Ideal']
diamonds["cut"].cat.categories
Index(['Fair', 'Good', 'Very Good', 'Premium', 'Ideal'], dtype='object')
diamonds["cut"].cat.codes
0        4
1        3
2        1
3        3
        ..
53936    1
53937    2
53938    3
53939    4
Length: 53940, dtype: int8

Category 타입의 변수는 데이터에 없는 level을 포함할 수 있음.

diamonds["cut2"] = pd.Categorical(
    diamonds["cut"], 
    categories=["Fair", "Good", "Very Good", "Premium", "Ideal", "Perfect"],
    ordered=True
)
diamonds.value_counts("cut2", sort=False)
cut2
Fair          1610
Good          4906
Very Good    12082
Premium      13791
Ideal        21551
Perfect          0
Name: count, dtype: int64
diamonds.groupby("cut2")["price"].mean()
cut2
Fair        4358.76
Good        3928.86
Very Good   3981.76
Premium     4584.26
Ideal       3457.54
Perfect         NaN
Name: price, dtype: float64

카테고리 순서의 적용

.value_count(), .groupby(), min(), max()
시각화 library: seaborn, pandas

diamonds.value_counts("cut", sort=False)
cut
Fair          1610
Good          4906
Very Good    12082
Premium      13791
Ideal        21551
Name: count, dtype: int64
diamonds.value_counts("cut", sort=False).index
CategoricalIndex(['Fair', 'Good', 'Very Good', 'Premium', 'Ideal'], categories=['Fair', 'Good', 'Very Good', 'Premium', 'Ideal'], ordered=True, dtype='category', name='cut')
# group keys
diamonds.groupby("cut2")["price"].mean()
cut2
Fair        4358.76
Good        3928.86
Very Good   3981.76
Premium     4584.26
Ideal       3457.54
Perfect         NaN
Name: price, dtype: float64
# min(), max()
diamonds["cut"].min()
'Fair'
# sort_values()
diamonds.sort_values("cut")
       carat    cut color clarity  depth  table  price    x    y    z   cut2
4654    1.00   Fair     F     SI1  66.70  57.00   3669 6.07 5.99 4.02   Fair
53338   1.20   Fair     G      I1  64.40  55.00   2655 6.77 6.61 4.31   Fair
40890   0.50   Fair     E     SI1  65.00  58.00   1176 4.98 4.90 3.21   Fair
9129    0.90   Fair     F     VS2  59.50  67.00   4536 6.29 6.24 3.73   Fair
...      ...    ...   ...     ...    ...    ...    ...  ...  ...  ...    ...
29309   0.34  Ideal     F     SI2  61.90  57.00    438 4.50 4.52 2.79  Ideal
29308   0.25  Ideal     G     VS1  62.70  54.00    438 4.05 4.08 2.55  Ideal
29339   0.31  Ideal     G     VS2  59.10  57.00    698 4.48 4.45 2.64  Ideal
53939   0.75  Ideal     D     SI2  62.20  55.00   2757 5.83 5.87 3.64  Ideal

[53940 rows x 11 columns]

Seaborn도 Categorical type을 지원함.

left = so.Plot(diamonds, x="cut2").add(so.Bar(), so.Count())
right = so.Plot(diamonds2, x="cut").add(so.Bar(), so.Count())

sns.boxplot(data=diamonds, x="cut2", y="price")
<Axes: xlabel='cut2', ylabel='price'>

diamonds.boxplot("price", by="cut2")
<Axes: title={'center': 'price'}, xlabel='cut2'>

Discretize/Bining

연속변수를 카테고리화하여 범주형 변수로 변환하여 분석
pd.cut(), pd.qcut()

  • parameters: bins, precision, labels
 # 동일한 길이의 10개 구간
(
    diamonds.assign(carat_cat = lambda x: pd.cut(x.carat, 10))
    .value_counts("carat_cat")
)
carat_cat
(0.195, 0.681]    25155
(0.681, 1.162]    18626
(1.162, 1.643]     7129
(1.643, 2.124]     2349
                  ...  
(3.086, 3.567]        6
(3.567, 4.048]        5
(4.048, 4.529]        2
(4.529, 5.01]         1
Name: count, Length: 10, dtype: int64
# 나누는 구간을 지정
(
    diamonds.assign(carat_cat = lambda x: pd.cut(x.carat, [0, 1, 3, 5]))
    .value_counts("carat_cat")
)
carat_cat
(0, 1]    36438
(1, 3]    17470
(3, 5]       31
Name: count, dtype: int64
# 동일한 갯수의 관측치를 포함하도록 하는 10개의 구간; 구간의 길이가 모두 다름
(
    diamonds.assign(carat_cat = lambda x: pd.qcut(x.carat, 10))
    .value_counts("carat_cat", sort=False)
)
carat_cat
(0.199, 0.31]    6452
(0.31, 0.35]     4606
(0.35, 0.42]     5421
(0.42, 0.53]     5106
                 ... 
(0.9, 1.01]      5078
(1.01, 1.13]     4573
(1.13, 1.51]     6052
(1.51, 5.01]     4635
Name: count, Length: 10, dtype: int64
Note
carat_cat = pd.cut(diamonds["carat"], 3)
carat_cat.dtype
# CategoricalDtype(categories=[(0.195, 1.803], (1.803, 3.407], (3.407, 5.01]], ordered=True)

carat_cat.cat.categories
# IntervalIndex([(0.195, 1.803], (1.803, 3.407], (3.407, 5.01]], dtype='interval[float64, right]')