8일차
09 결측값 알아보기¶
판다스에서는 NULL 을 NaN 으로 표기
In [1]:
import pandas as pd
In [2]:
from numpy import NaN, NAN, nan
print(NaN == True)
print(NaN == 0)
print(NaN == '')
print(NaN == NaN)
print(NaN == NAN)
print(NaN == nan)
print(nan== NAN)
False False False False False False False
In [3]:
print(pd.isnull(NaN))
print(pd.isnull(NAN))
print(pd.isnull(nan))
True True True
In [4]:
print(pd.notnull(NaN))
print(pd.notnull(42))
print(pd.notnull('missing'))
False True True
In [5]:
visited_file = '../data/survey_visited.csv'
print(pd.read_csv(visited_file))
ident site dated 0 619 DR-1 1927-02-08 1 622 DR-1 1927-02-10 2 734 DR-3 1939-01-07 3 735 DR-3 1930-01-12 4 751 DR-3 1930-02-26 5 752 DR-3 NaN 6 837 MSK-4 1932-01-14 7 844 DR-1 1932-03-22
In [6]:
print(pd.read_csv(visited_file, keep_default_na=False))
ident site dated 0 619 DR-1 1927-02-08 1 622 DR-1 1927-02-10 2 734 DR-3 1939-01-07 3 735 DR-3 1930-01-12 4 751 DR-3 1930-02-26 5 752 DR-3 6 837 MSK-4 1932-01-14 7 844 DR-1 1932-03-22
In [7]:
print(pd.read_csv(visited_file, na_values=[""], keep_default_na=False))
ident site dated 0 619 DR-1 1927-02-08 1 622 DR-1 1927-02-10 2 734 DR-3 1939-01-07 3 735 DR-3 1930-01-12 4 751 DR-3 1930-02-26 5 752 DR-3 NaN 6 837 MSK-4 1932-01-14 7 844 DR-1 1932-03-22
In [8]:
visited = pd.read_csv('../data/survey_visited.csv')
survey = pd.read_csv('../data/survey_survey.csv')
print(visited)
ident site dated 0 619 DR-1 1927-02-08 1 622 DR-1 1927-02-10 2 734 DR-3 1939-01-07 3 735 DR-3 1930-01-12 4 751 DR-3 1930-02-26 5 752 DR-3 NaN 6 837 MSK-4 1932-01-14 7 844 DR-1 1932-03-22
In [9]:
print(survey)
taken person quant reading 0 619 dyer rad 9.82 1 619 dyer sal 0.13 2 622 dyer rad 7.80 3 622 dyer sal 0.09 4 734 pb rad 8.41 5 734 lake sal 0.05 6 734 pb temp -21.50 7 735 pb rad 7.22 8 735 NaN sal 0.06 9 735 NaN temp -26.00 10 751 pb rad 4.35 11 751 pb temp -18.50 12 751 lake sal 0.10 13 752 lake rad 2.19 14 752 lake sal 0.09 15 752 lake temp -16.00 16 752 roe sal 41.60 17 837 lake rad 1.46 18 837 lake sal 0.21 19 837 roe sal 22.50 20 844 roe rad 11.25
In [10]:
vs = visited.merge(survey, left_on='ident', right_on='taken')
print(vs)
ident site dated taken person quant reading 0 619 DR-1 1927-02-08 619 dyer rad 9.82 1 619 DR-1 1927-02-08 619 dyer sal 0.13 2 622 DR-1 1927-02-10 622 dyer rad 7.80 3 622 DR-1 1927-02-10 622 dyer sal 0.09 4 734 DR-3 1939-01-07 734 pb rad 8.41 5 734 DR-3 1939-01-07 734 lake sal 0.05 6 734 DR-3 1939-01-07 734 pb temp -21.50 7 735 DR-3 1930-01-12 735 pb rad 7.22 8 735 DR-3 1930-01-12 735 NaN sal 0.06 9 735 DR-3 1930-01-12 735 NaN temp -26.00 10 751 DR-3 1930-02-26 751 pb rad 4.35 11 751 DR-3 1930-02-26 751 pb temp -18.50 12 751 DR-3 1930-02-26 751 lake sal 0.10 13 752 DR-3 NaN 752 lake rad 2.19 14 752 DR-3 NaN 752 lake sal 0.09 15 752 DR-3 NaN 752 lake temp -16.00 16 752 DR-3 NaN 752 roe sal 41.60 17 837 MSK-4 1932-01-14 837 lake rad 1.46 18 837 MSK-4 1932-01-14 837 lake sal 0.21 19 837 MSK-4 1932-01-14 837 roe sal 22.50 20 844 DR-1 1932-03-22 844 roe rad 11.25
In [11]:
num_legs = pd.Series({'goat': 4, 'amoeba': nan})
print(num_legs)
goat 4.0 amoeba NaN dtype: float64
In [13]:
scientists = pd.DataFrame(
{
"Name": ["Rosaline Franklin", "William Gosset"],
"Occupation": ["Chemist","Statistician"],
"Born": ["1990-07-25", "1876-06-13"],
"Died": ["1958-04-16", "1937-10-16"],
"missing": [NaN, nan],
}
)
print(scientists)
Name Occupation Born Died missing 0 Rosaline Franklin Chemist 1990-07-25 1958-04-16 NaN 1 William Gosset Statistician 1876-06-13 1937-10-16 NaN
In [14]:
print(scientists.dtypes)
Name object Occupation object Born object Died object missing float64 dtype: object
In [15]:
gapminder = pd.read_csv('../data/gapminder.tsv', sep='\t')
life_exp = gapminder.groupby('year')['lifeExp'].mean()
print(life_exp)
year 1952 49.057620 1957 51.507401 1962 53.609249 1967 55.678290 1972 57.647386 1977 59.570157 1982 61.533197 1987 63.212613 1992 64.160338 1997 65.014676 2002 65.694923 2007 67.007423 Name: lifeExp, dtype: float64
In [17]:
y2000 = life_exp[life_exp.index > 2000]
print(y2000)
year 2002 65.694923 2007 67.007423 Name: lifeExp, dtype: float64
In [18]:
print(y2000.reindex(range(2000, 2010)))
year 2000 NaN 2001 NaN 2002 65.694923 2003 NaN 2004 NaN 2005 NaN 2006 NaN 2007 67.007423 2008 NaN 2009 NaN Name: lifeExp, dtype: float64
In [19]:
ebola = pd.read_csv('../data/country_timeseries.csv')
print(ebola.count())
Date 122 Day 122 Cases_Guinea 93 Cases_Liberia 83 Cases_SierraLeone 87 Cases_Nigeria 38 Cases_Senegal 25 Cases_UnitedStates 18 Cases_Spain 16 Cases_Mali 12 Deaths_Guinea 92 Deaths_Liberia 81 Deaths_SierraLeone 87 Deaths_Nigeria 38 Deaths_Senegal 22 Deaths_UnitedStates 18 Deaths_Spain 16 Deaths_Mali 12 dtype: int64
In [20]:
num_rows = ebola.shape[0]
num_missing = num_rows - ebola.count()
print(num_missing)
Date 0 Day 0 Cases_Guinea 29 Cases_Liberia 39 Cases_SierraLeone 35 Cases_Nigeria 84 Cases_Senegal 97 Cases_UnitedStates 104 Cases_Spain 106 Cases_Mali 110 Deaths_Guinea 30 Deaths_Liberia 41 Deaths_SierraLeone 35 Deaths_Nigeria 84 Deaths_Senegal 100 Deaths_UnitedStates 104 Deaths_Spain 106 Deaths_Mali 110 dtype: int64
In [21]:
import numpy as np
print(np.count_nonzero(ebola.isnull()))
1214
In [22]:
print(np.count_nonzero(ebola['Cases_Guinea'].isnull()))
29
In [23]:
cnts = ebola.Cases_Guinea.value_counts(dropna=False)
print(cnts)
Cases_Guinea NaN 29 86.0 3 495.0 2 112.0 2 390.0 2 .. 1199.0 1 1298.0 1 1350.0 1 1472.0 1 49.0 1 Name: count, Length: 89, dtype: int64
In [24]:
print(cnts.loc[pd.isnull(cnts.index)])
Cases_Guinea NaN 29 Name: count, dtype: int64
In [25]:
print(ebola.Cases_Guinea.isnull().sum())
29
In [26]:
print(ebola.fillna(0).iloc[:, 0:5])
Date Day Cases_Guinea Cases_Liberia Cases_SierraLeone 0 1/5/2015 289 2776.0 0.0 10030.0 1 1/4/2015 288 2775.0 0.0 9780.0 2 1/3/2015 287 2769.0 8166.0 9722.0 3 1/2/2015 286 0.0 8157.0 0.0 4 12/31/2014 284 2730.0 8115.0 9633.0 .. ... ... ... ... ... 117 3/27/2014 5 103.0 8.0 6.0 118 3/26/2014 4 86.0 0.0 0.0 119 3/25/2014 3 86.0 0.0 0.0 120 3/24/2014 2 86.0 0.0 0.0 121 3/22/2014 0 49.0 0.0 0.0 [122 rows x 5 columns]
In [27]:
print(ebola.fillna(method='ffill').iloc[:, 0:5])
Date Day Cases_Guinea Cases_Liberia Cases_SierraLeone 0 1/5/2015 289 2776.0 NaN 10030.0 1 1/4/2015 288 2775.0 NaN 9780.0 2 1/3/2015 287 2769.0 8166.0 9722.0 3 1/2/2015 286 2769.0 8157.0 9722.0 4 12/31/2014 284 2730.0 8115.0 9633.0 .. ... ... ... ... ... 117 3/27/2014 5 103.0 8.0 6.0 118 3/26/2014 4 86.0 8.0 6.0 119 3/25/2014 3 86.0 8.0 6.0 120 3/24/2014 2 86.0 8.0 6.0 121 3/22/2014 0 49.0 8.0 6.0 [122 rows x 5 columns]
In [28]:
print(ebola.fillna(method='bfill').iloc[:, 0:5])
Date Day Cases_Guinea Cases_Liberia Cases_SierraLeone 0 1/5/2015 289 2776.0 8166.0 10030.0 1 1/4/2015 288 2775.0 8166.0 9780.0 2 1/3/2015 287 2769.0 8166.0 9722.0 3 1/2/2015 286 2730.0 8157.0 9633.0 4 12/31/2014 284 2730.0 8115.0 9633.0 .. ... ... ... ... ... 117 3/27/2014 5 103.0 8.0 6.0 118 3/26/2014 4 86.0 NaN NaN 119 3/25/2014 3 86.0 NaN NaN 120 3/24/2014 2 86.0 NaN NaN 121 3/22/2014 0 49.0 NaN NaN [122 rows x 5 columns]
In [29]:
print(ebola.interpolate().iloc[:, 0:5])
Date Day Cases_Guinea Cases_Liberia Cases_SierraLeone 0 1/5/2015 289 2776.0 NaN 10030.0 1 1/4/2015 288 2775.0 NaN 9780.0 2 1/3/2015 287 2769.0 8166.0 9722.0 3 1/2/2015 286 2749.5 8157.0 9677.5 4 12/31/2014 284 2730.0 8115.0 9633.0 .. ... ... ... ... ... 117 3/27/2014 5 103.0 8.0 6.0 118 3/26/2014 4 86.0 8.0 6.0 119 3/25/2014 3 86.0 8.0 6.0 120 3/24/2014 2 86.0 8.0 6.0 121 3/22/2014 0 49.0 8.0 6.0 [122 rows x 5 columns]
In [30]:
print(ebola.shape)
(122, 18)
In [34]:
ebola_dropna = ebola.dropna()
print(ebola_dropna.shape)
(1, 18)
In [35]:
ebola["Cases_multiple"] = (
ebola["Cases_Guinea"]
+ ebola["Cases_Liberia"]
+ ebola["Cases_SierraLeone"]
)
In [36]:
ebola_subset = ebola.loc[:, ["Cases_Guinea", "Cases_Liberia", "Cases_SierraLeone", "Cases_multiple"]]
print(ebola_subset.head(n=10))
Cases_Guinea Cases_Liberia Cases_SierraLeone Cases_multiple 0 2776.0 NaN 10030.0 NaN 1 2775.0 NaN 9780.0 NaN 2 2769.0 8166.0 9722.0 20657.0 3 NaN 8157.0 NaN NaN 4 2730.0 8115.0 9633.0 20478.0 5 2706.0 8018.0 9446.0 20170.0 6 2695.0 NaN 9409.0 NaN 7 2630.0 7977.0 9203.0 19810.0 8 2597.0 NaN 9004.0 NaN 9 2571.0 7862.0 8939.0 19372.0
In [37]:
print(ebola.Cases_Guinea.sum(skipna=True))
84729.0
In [38]:
print(ebola.Cases_Guinea.sum(skipna=False))
nan
In [39]:
scientists.loc[1, "Name"] = pd.NA
scientists.loc[1, "Age"] = pd.NA
print(scientists)
Name Occupation Born Died missing Age 0 Rosaline Franklin Chemist 1990-07-25 1958-04-16 NaN NaN 1 <NA> Statistician 1876-06-13 1937-10-16 NaN NaN
In [40]:
print(scientists.dtypes)
Name object Occupation object Born object Died object missing float64 Age float64 dtype: object
10장 자료형 더 알아보기¶
In [1]:
import pandas as pd
import seaborn as sns
tips = sns.load_dataset("tips")
In [2]:
print(tips.dtypes)
total_bill float64 tip float64 sex category smoker category day category time category size int64 dtype: object
In [3]:
tips['sex_str'] = tips['sex'].astype('str')
print(tips.dtypes)
total_bill float64 tip float64 sex category smoker category day category time category size int64 sex_str object dtype: object
In [4]:
tips['total_bill'] = tips['total_bill'].astype('str')
print(tips.dtypes)
total_bill object tip float64 sex category smoker category day category time category size int64 sex_str object dtype: object
In [5]:
tips['total_bill'] = tips['total_bill'].astype('float')
print(tips.dtypes)
total_bill float64 tip float64 sex category smoker category day category time category size int64 sex_str object dtype: object
In [6]:
tips_sub_miss = tips.head(10).copy()
tips_sub_miss.loc[[1,3,5,7], 'total_bill'] = 'missing'
print(tips_sub_miss)
total_bill tip sex smoker day time size sex_str 0 16.99 1.01 Female No Sun Dinner 2 Female 1 missing 1.66 Male No Sun Dinner 3 Male 2 21.01 3.50 Male No Sun Dinner 3 Male 3 missing 3.31 Male No Sun Dinner 2 Male 4 24.59 3.61 Female No Sun Dinner 4 Female 5 missing 4.71 Male No Sun Dinner 4 Male 6 8.77 2.00 Male No Sun Dinner 2 Male 7 missing 3.12 Male No Sun Dinner 4 Male 8 15.04 1.96 Male No Sun Dinner 2 Male 9 14.78 3.23 Male No Sun Dinner 2 Male
In [7]:
print(tips_sub_miss.dtypes)
total_bill object tip float64 sex category smoker category day category time category size int64 sex_str object dtype: object
In [8]:
pd.to_numeric(tips_sub_miss['total_bill'])
--------------------------------------------------------------------------- ValueError Traceback (most recent call last) File ~\anaconda3\Lib\site-packages\pandas\_libs\lib.pyx:2280, in pandas._libs.lib.maybe_convert_numeric() ValueError: Unable to parse string "missing" During handling of the above exception, another exception occurred: ValueError Traceback (most recent call last) Cell In[8], line 1 ----> 1 pd.to_numeric(tips_sub_miss['total_bill']) File ~\anaconda3\Lib\site-packages\pandas\core\tools\numeric.py:217, in to_numeric(arg, errors, downcast, dtype_backend) 215 coerce_numeric = errors not in ("ignore", "raise") 216 try: --> 217 values, new_mask = lib.maybe_convert_numeric( # type: ignore[call-overload] # noqa 218 values, 219 set(), 220 coerce_numeric=coerce_numeric, 221 convert_to_masked_nullable=dtype_backend is not lib.no_default 222 or isinstance(values_dtype, StringDtype), 223 ) 224 except (ValueError, TypeError): 225 if errors == "raise": File ~\anaconda3\Lib\site-packages\pandas\_libs\lib.pyx:2322, in pandas._libs.lib.maybe_convert_numeric() ValueError: Unable to parse string "missing" at position 1
In [9]:
tips_sub_miss['total_bill'] = pd.to_numeric(tips_sub_miss['total_bill'], errors='ignore')
print(tips_sub_miss)
total_bill tip sex smoker day time size sex_str 0 16.99 1.01 Female No Sun Dinner 2 Female 1 missing 1.66 Male No Sun Dinner 3 Male 2 21.01 3.50 Male No Sun Dinner 3 Male 3 missing 3.31 Male No Sun Dinner 2 Male 4 24.59 3.61 Female No Sun Dinner 4 Female 5 missing 4.71 Male No Sun Dinner 4 Male 6 8.77 2.00 Male No Sun Dinner 2 Male 7 missing 3.12 Male No Sun Dinner 4 Male 8 15.04 1.96 Male No Sun Dinner 2 Male 9 14.78 3.23 Male No Sun Dinner 2 Male
In [10]:
print(tips_sub_miss.dtypes)
total_bill object tip float64 sex category smoker category day category time category size int64 sex_str object dtype: object
In [11]:
tips_sub_miss['total_bill'] = pd.to_numeric(tips_sub_miss['total_bill'], errors='coerce')
print(tips_sub_miss)
total_bill tip sex smoker day time size sex_str 0 16.99 1.01 Female No Sun Dinner 2 Female 1 NaN 1.66 Male No Sun Dinner 3 Male 2 21.01 3.50 Male No Sun Dinner 3 Male 3 NaN 3.31 Male No Sun Dinner 2 Male 4 24.59 3.61 Female No Sun Dinner 4 Female 5 NaN 4.71 Male No Sun Dinner 4 Male 6 8.77 2.00 Male No Sun Dinner 2 Male 7 NaN 3.12 Male No Sun Dinner 4 Male 8 15.04 1.96 Male No Sun Dinner 2 Male 9 14.78 3.23 Male No Sun Dinner 2 Male
In [12]:
print(tips_sub_miss.dtypes)
total_bill float64 tip float64 sex category smoker category day category time category size int64 sex_str object dtype: object
In [13]:
tips['sex'] = tips['sex'].astype('str')
print(tips.info())
<class 'pandas.core.frame.DataFrame'> RangeIndex: 244 entries, 0 to 243 Data columns (total 8 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 total_bill 244 non-null float64 1 tip 244 non-null float64 2 sex 244 non-null object 3 smoker 244 non-null category 4 day 244 non-null category 5 time 244 non-null category 6 size 244 non-null int64 7 sex_str 244 non-null object dtypes: category(3), float64(2), int64(1), object(2) memory usage: 10.8+ KB None
In [14]:
tips['sex'] = tips['sex'].astype('category')
print(tips.info())
<class 'pandas.core.frame.DataFrame'> RangeIndex: 244 entries, 0 to 243 Data columns (total 8 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 total_bill 244 non-null float64 1 tip 244 non-null float64 2 sex 244 non-null category 3 smoker 244 non-null category 4 day 244 non-null category 5 time 244 non-null category 6 size 244 non-null int64 7 sex_str 244 non-null object dtypes: category(4), float64(2), int64(1), object(1) memory usage: 9.3+ KB None
In [ ]:
'도서 > 프로그래밍' 카테고리의 다른 글
[01] 쉽게 배우는 JSP 웹 프로그래밍 (1) | 2024.01.13 |
---|---|
[09][完] Do it! 데이터 분석을 위한 판다스 입문 (0) | 2024.01.09 |
[07] Do it! 데이터 분석을 위한 판다스 입문 (0) | 2024.01.08 |
[06] Do it! 데이터 분석을 위한 판다스 입문 (0) | 2024.01.07 |
[05] Do it! 데이터 분석을 위한 판다스 입문 (0) | 2024.01.06 |