Load Packages
# numerical calculation & data frames
import numpy as np
import pandas as pd
# visualization
import matplotlib.pyplot as plt
import seaborn as sns
import seaborn.objects as so
# statistics
import statsmodels.api as sm
R for Data Science by Wickham & Grolemund
# numerical calculation & data frames
import numpy as np
import pandas as pd
# visualization
import matplotlib.pyplot as plt
import seaborn as sns
import seaborn.objects as so
# statistics
import statsmodels.api as sm
# pandas display options
pd.options.display.precision = 2  # digits shown after the decimal point
pd.options.display.float_format = '{:.2f}'.format  # undo with pd.reset_option('display.float_format')
pd.options.display.max_rows = 7  # truncate long frames when printing
# NumPy print options
np.set_printoptions(precision=2, suppress=True)  # 2 decimals, no scientific notation
Data: On-time data for all flights that departed NYC (i.e. JFK, LGA or EWR) in 2013
# import the dataset
flights_data = sm.datasets.get_rdataset("flights", "nycflights13")
flights = flights_data.data
flights = flights.drop(columns="time_hour")  # drop the "time_hour" column

# Description
print(flights_data.__doc__)
flights
year month day dep_time sched_dep_time dep_delay arr_time \
0 2013 1 1 517.00 515 2.00 830.00
1 2013 1 1 533.00 529 4.00 850.00
2 2013 1 1 542.00 540 2.00 923.00
... ... ... ... ... ... ... ...
336773 2013 9 30 NaN 1210 NaN NaN
336774 2013 9 30 NaN 1159 NaN NaN
336775 2013 9 30 NaN 840 NaN NaN
sched_arr_time arr_delay carrier flight tailnum origin dest \
0 819 11.00 UA 1545 N14228 EWR IAH
1 830 20.00 UA 1714 N24211 LGA IAH
2 850 33.00 AA 1141 N619AA JFK MIA
... ... ... ... ... ... ... ...
336773 1330 NaN MQ 3461 N535MQ LGA BNA
336774 1344 NaN MQ 3572 N511MQ LGA CLE
336775 1020 NaN MQ 3531 N839MQ LGA RDU
air_time distance hour minute
0 227.00 1400 5 15
1 227.00 1416 5 29
2 160.00 1089 5 40
... ... ... ... ...
336773 NaN 764 12 10
336774 NaN 419 11 59
336775 NaN 431 8 40
[336776 rows x 18 columns]
Subsetting options
Bracket안에 labels이 있는 경우 columns을 select
flights['dest']  # a single label returns a Series
0 IAH
1 IAH
2 MIA
...
336773 BNA
336774 CLE
336775 RDU
Name: dest, Length: 336776, dtype: object
flights[['dest']]  # a list with one label returns a DataFrame
dest
0 IAH
1 IAH
2 MIA
... ...
336773 BNA
336774 CLE
336775 RDU
[336776 rows x 1 columns]
flights[['origin', 'dest']]  # a list of labels selects several columns
origin dest
0 EWR IAH
1 LGA IAH
2 JFK MIA
... ... ...
336773 LGA BNA
336774 LGA CLE
336775 LGA RDU
[336776 rows x 2 columns]
Bracket안에 numbers가 있는 경우 rows를 select - position-based
flights[2:5]  # a slice inside brackets selects rows by position
year month day dep_time sched_dep_time dep_delay arr_time \
2 2013 1 1 542.00 540 2.00 923.00
3 2013 1 1 544.00 545 -1.00 1004.00
4 2013 1 1 554.00 600 -6.00 812.00
sched_arr_time arr_delay carrier flight tailnum origin dest air_time \
2 850 33.00 AA 1141 N619AA JFK MIA 160.00
3 1022 -18.00 B6 725 N804JB JFK BQN 183.00
4 837 -25.00 DL 461 N668DN LGA ATL 116.00
distance hour minute
2 1089 5 40
3 1576 5 45
4 762 6 0
만약, 아래와 같이 index가 number일 때 out of order가 된 경우에도 row position으로 적용됨
origin dest arr_delay
42 LGA DFW 48.00
2 JFK MIA 33.00
25 EWR ORD 32.00
14 LGA DFW 31.00
33 EWR MSP 29.00
df_outoforder[2:4]  # still positional, even though the index labels are out of order
origin dest arr_delay
25 EWR ORD 32.00
14 LGA DFW 31.00
Chaining with brackets
flights[['origin', 'dest']][2:5]
# same result in the other order: flights[2:5][['origin', 'dest']]
origin dest
2 JFK MIA
3 JFK BQN
4 LGA ATL
편리하나 주의해서 사용할 필요가 있음
- 변수명이 count인 경우 df.count는 df의 method로 인식
- df.new_var = 1 불가, 대신 df["new_var"] = 1
- vars_names=["origin", "dest"]일 때, df[vars_names]는 "origin"과 "dest" columns을 선택하지만, df.vars_names는 vars_names이라는 이름의 column을 의미
flights.dest # flights["dest"]와 동일
0 IAH
1 IAH
2 MIA
...
336773 BNA
336774 CLE
336775 RDU
Name: dest, Length: 336776, dtype: object
각각 location, integer location의 약자
df.(i)loc[row_indexer, column_indexer]
flights.loc[2:5, ['origin', 'dest']]  # 2:5 are index labels, not positions
origin dest
2 JFK MIA
3 JFK BQN
4 LGA ATL
5 EWR ORD
다음과 같이 index가 labels인 경우는 혼동의 염려 없음
origin dest
red JFK MIA
blue JFK BQN
green LGA ATL
yellow EWR ORD
"blue":"green", :] df_labels.loc[
origin dest
blue JFK BQN
green LGA ATL
하지만, index가 number인 경우는 혼동이 있음
앞서 본 예에서처럼 index가 out of order인 경우 loc은 다르게 작동
origin dest arr_delay
42 LGA DFW 48.00
2 JFK MIA 33.00
25 EWR ORD 32.00
14 LGA DFW 31.00
33 EWR MSP 29.00
df_outoforder.loc[2:14, :]  # label-based, not positional
origin dest arr_delay
2 JFK MIA 33.00
25 EWR ORD 32.00
14 LGA DFW 31.00
df_outoforder.loc[[25, 33], :]  # pick specific index labels instead of slicing
origin dest arr_delay
25 EWR ORD 32.00
33 EWR MSP 29.00
flights.loc[2:5, 'dest']  # returns as a Series
2 MIA
3 BQN
4 ATL
5 ORD
Name: dest, dtype: object
flights.loc[2:5, ['dest']]  # return as a DataFrame
dest
2 MIA
3 BQN
4 ATL
5 ORD
생략
# The column indexer may be omitted; these three forms are written as equivalents
flights.loc[2:5, :]  # ':' means all
flights.loc[2:5]
flights.loc[2:5, ]
# select a single row
flights.loc[2, :]  # returns as a Series, column names as its index
year 2013
month 1
day 1
...
distance 1089
hour 5
minute 40
Name: 2, Length: 18, dtype: object
# select a single row
flights.loc[[2], :]  # returns as a DataFrame
year month day dep_time sched_dep_time dep_delay arr_time \
2 2013 1 1 542.00 540 2.00 923.00
sched_arr_time arr_delay carrier flight tailnum origin dest air_time \
2 850 33.00 AA 1141 N619AA JFK MIA 160.00
distance hour minute
2 1089 5 40
flights.iloc[2:5, 12:14]  # 2:5 are positions; the end position is excluded
origin dest
2 JFK MIA
3 JFK BQN
4 LGA ATL
flights.iloc[2:5, 12]  # return as a Series
2 JFK
3 JFK
4 LGA
Name: origin, dtype: object
flights.iloc[2:5, :]
# all of the following also work
# flights.iloc[2:5]
# flights.iloc[2:5, ]
year month day dep_time sched_dep_time dep_delay arr_time \
2 2013 1 1 542.00 540 2.00 923.00
3 2013 1 1 544.00 545 -1.00 1004.00
4 2013 1 1 554.00 600 -6.00 812.00
sched_arr_time arr_delay carrier flight tailnum origin dest air_time \
2 850 33.00 AA 1141 N619AA JFK MIA 160.00
3 1022 -18.00 B6 725 N804JB JFK BQN 183.00
4 837 -25.00 DL 461 N668DN LGA ATL 116.00
distance hour minute
2 1089 5 40
3 1576 5 45
4 762 6 0
flights.iloc[2:5, [12]]  # return as a DataFrame
origin
2 JFK
3 JFK
4 LGA
flights.iloc[[2, 5, 7], 12:14]  # select rows at specific positions
origin dest
2 JFK MIA
5 EWR ORD
7 LGA IAD
단 하나의 scalar 값을 추출할 때, 빠른 처리를 하는 다음을 사용할 수 있음
.at[i, j], .iat[i, j]
DataFrame과 같은 방식으로 이해
Index가 numbers인 경우
42 DFW
2 MIA
25 ORD
14 DFW
33 MSP
Name: dest, dtype: object
s.loc[25:14]  # label-based slice: from label 25 through label 14
25 ORD
14 DFW
Name: dest, dtype: object
s.iloc[2:4]  # position-based slice: positions 2 and 3
25 ORD
14 DFW
Name: dest, dtype: object
s[:3]  # a plain slice on a Series selects the first three rows by position
42 DFW
2 MIA
25 ORD
Name: dest, dtype: object
다음과 같은 경우 혼동스러움
s[3]  # ambiguous: third position or label 3?
#> errors occur
Index가 labels인 경우
red MIA
blue BQN
green ATL
yellow ORD
Name: dest, dtype: object
"red", "green"]] s[[
red MIA
green ATL
Name: dest, dtype: object
# Fix the seed so the 6-row sample is reproducible
np.random.seed(123)
flights_6 = flights[:100][["dep_delay", "arr_delay", "origin", "dest"]].sample(6)
flights_6
dep_delay arr_delay origin dest
8 -3.00 -8.00 JFK MCO
70 9.00 20.00 LGA ORD
82 -1.00 -26.00 JFK SFO
28 0.00 -21.00 JFK SJU
63 -2.00 2.00 JFK LAX
0 2.00 11.00 EWR IAH
"dep_delay"] < 0] flights_6[flights_6[
dep_delay arr_delay origin dest
8 -3.00 -8.00 JFK MCO
82 -1.00 -26.00 JFK SFO
63 -2.00 2.00 JFK LAX
idx = flights_6["dep_delay"] < 0
idx  # a Series of bool dtype
8 True
70 False
82 True
28 False
63 True
0 False
Name: dep_delay, dtype: bool
# Select a column with the boolean indexing
flights_6[idx]["dest"]
8 MCO
82 SFO
63 LAX
Name: dest, dtype: object
사실, boolean indexing을 할때, DataFrame/Series의 index와 match함
대부분 염려하지 않아도 되나 다음과 같은 결과 참고
# Reset index, so the labels no longer match those of flights_6
idx_reset = idx.reset_index(drop=True)
idx_reset
# 0     True
# 1    False
# 2     True
# 3    False
# 4     True
# 5    False
# Name: dep_delay, dtype: bool

# A boolean Series is matched on index labels, so a mismatched index fails
flights_6[idx_reset]["dest"]
#> IndexingError: Unalignable boolean Series provided as indexer
#> (index of the boolean Series and of the indexed object do not match)

# A NumPy array carries no index, so boolean indexing with it works
flights_6[idx_reset.to_numpy()]["dest"]
# 8     MCO
# 82    SFO
# 63    LAX
# Name: dest, dtype: object
# Element-wise comparison over two columns gives a boolean DataFrame
bool_idx = flights_6[["dep_delay", "arr_delay"]] > 0
bool_idx
dep_delay arr_delay
8 False False
70 True True
82 False False
28 False False
63 False True
0 True True
idx_any = bool_idx.any(axis=1)  # True where at least one column in the row is True
idx_any
8 False
70 True
82 False
28 False
63 True
0 True
dtype: bool
bool_idx.all(axis=1)  # True where every column in the row is True
8 False
70 True
82 False
28 False
63 False
0 True
dtype: bool
np.where() 활용: np.where(boolean condition, value if True, value if False)
"delayed"] = np.where(idx, "delayed", "on-time")
flights_6[ flights_6
dep_delay arr_delay origin dest delayed
8 -3.00 -8.00 JFK MCO delayed
70 9.00 20.00 LGA ORD on-time
82 -1.00 -26.00 JFK SFO delayed
28 0.00 -21.00 JFK SJU on-time
63 -2.00 2.00 JFK LAX delayed
0 2.00 11.00 EWR IAH on-time
"dest"].str.startswith("S"), "S", "T") # str method: "S"로 시작하는지 여부 np.where(flights_6[
array(['T', 'T', 'S', 'S', 'T', 'T'], dtype='<U1')
"dest"] # flights_6[idx]["dest"]과 동일 flights_6.loc[idx,
8 MCO
82 SFO
63 LAX
Name: dest, dtype: object
만약 column 이름에 “time”을 포함하는 columns만 선택하고자 하면
Series/Index object는 str method 존재
str.contains()
,str.startswith()
,str.endswith()
자세한 사항은 7.4 String Manipulation/String Functions in pandas by Wes McKinney
cols = flights.columns.str.contains("time")  # str method: does the column name contain "time"?
cols
array([False, False, False, True, True, False, True, True, False,
False, False, False, False, False, True, False, False, False])
# Boolean indexing along the columns
flights.loc[:, cols]
dep_time sched_dep_time arr_time sched_arr_time air_time
0 517.00 515 830.00 819 227.00
1 533.00 529 850.00 830 227.00
2 542.00 540 923.00 850 160.00
... ... ... ... ... ...
336773 NaN 1210 NaN 1330 NaN
336774 NaN 1159 NaN 1344 NaN
336775 NaN 840 NaN 1020 NaN
[336776 rows x 5 columns]
Chained indexing으로 값을 assign하는 경우 copy vs. view 경고 메세지
"arr_delay"] < 0]["arr_delay"] = 0 flights[flights[
/var/folders/mp/vcywncl97ml2q4c_5k2r573m0000gn/T/ipykernel_96692/3780864177.py:1
: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
경고가 제시하는 대로 .loc을 이용하여 assign
"arr_delay"] < 0, "arr_delay"] = 0 flights.loc[flights[
df[["var1", "var2"]]
df[:10]
df.loc[:, ["var1", "var2"]]는 df[["var1", "var2"]]과 동일
df.iloc[:10, :]은 df[:10]와 동일
df[bool_idx]
df.loc[bool_idx, :]
df[:5]["dest"], df.loc[:4, "dest"]: index가 0부터 정렬되어 있다고 가정했을 때, slicing에서 위치 하나 차이남
df.iloc[:5, 13]: "dest"의 column 위치 13
Series로 반환: df["var1"] 또는 df.loc[2, :]
Numpy의 indexing에 대해서는 교재 참고
Ch.4/Basic Indexing and Slicing in Python Data Analysis by Wes McKinney