09 결측값 알아보기¶

판다스에서는 NULL 을 NaN 으로 표기

In [1]:

import pandas as pd

In [2]:

from numpy import NaN, NAN, nan

print(NaN == True)
print(NaN == 0)
print(NaN == '')
print(NaN == NaN)
print(NaN == NAN)
print(NaN == nan)
print(nan== NAN)

False
False
False
False
False
False
False

In [3]:

print(pd.isnull(NaN))
print(pd.isnull(NAN))
print(pd.isnull(nan))

True
True
True

In [4]:

print(pd.notnull(NaN))
print(pd.notnull(42))
print(pd.notnull('missing'))

False
True
True

In [5]:

visited_file = '../data/survey_visited.csv'
print(pd.read_csv(visited_file))

   ident   site       dated
0    619   DR-1  1927-02-08
1    622   DR-1  1927-02-10
2    734   DR-3  1939-01-07
3    735   DR-3  1930-01-12
4    751   DR-3  1930-02-26
5    752   DR-3         NaN
6    837  MSK-4  1932-01-14
7    844   DR-1  1932-03-22

In [6]:

print(pd.read_csv(visited_file, keep_default_na=False))

   ident   site       dated
0    619   DR-1  1927-02-08
1    622   DR-1  1927-02-10
2    734   DR-3  1939-01-07
3    735   DR-3  1930-01-12
4    751   DR-3  1930-02-26
5    752   DR-3            
6    837  MSK-4  1932-01-14
7    844   DR-1  1932-03-22

In [7]:

print(pd.read_csv(visited_file, na_values=[""], keep_default_na=False))

   ident   site       dated
0    619   DR-1  1927-02-08
1    622   DR-1  1927-02-10
2    734   DR-3  1939-01-07
3    735   DR-3  1930-01-12
4    751   DR-3  1930-02-26
5    752   DR-3         NaN
6    837  MSK-4  1932-01-14
7    844   DR-1  1932-03-22

In [8]:

visited = pd.read_csv('../data/survey_visited.csv')
survey = pd.read_csv('../data/survey_survey.csv')
print(visited)

   ident   site       dated
0    619   DR-1  1927-02-08
1    622   DR-1  1927-02-10
2    734   DR-3  1939-01-07
3    735   DR-3  1930-01-12
4    751   DR-3  1930-02-26
5    752   DR-3         NaN
6    837  MSK-4  1932-01-14
7    844   DR-1  1932-03-22

In [9]:

print(survey)

    taken person quant  reading
0     619   dyer   rad     9.82
1     619   dyer   sal     0.13
2     622   dyer   rad     7.80
3     622   dyer   sal     0.09
4     734     pb   rad     8.41
5     734   lake   sal     0.05
6     734     pb  temp   -21.50
7     735     pb   rad     7.22
8     735    NaN   sal     0.06
9     735    NaN  temp   -26.00
10    751     pb   rad     4.35
11    751     pb  temp   -18.50
12    751   lake   sal     0.10
13    752   lake   rad     2.19
14    752   lake   sal     0.09
15    752   lake  temp   -16.00
16    752    roe   sal    41.60
17    837   lake   rad     1.46
18    837   lake   sal     0.21
19    837    roe   sal    22.50
20    844    roe   rad    11.25

In [10]:

vs = visited.merge(survey, left_on='ident', right_on='taken')
print(vs)

    ident   site       dated  taken person quant  reading
0     619   DR-1  1927-02-08    619   dyer   rad     9.82
1     619   DR-1  1927-02-08    619   dyer   sal     0.13
2     622   DR-1  1927-02-10    622   dyer   rad     7.80
3     622   DR-1  1927-02-10    622   dyer   sal     0.09
4     734   DR-3  1939-01-07    734     pb   rad     8.41
5     734   DR-3  1939-01-07    734   lake   sal     0.05
6     734   DR-3  1939-01-07    734     pb  temp   -21.50
7     735   DR-3  1930-01-12    735     pb   rad     7.22
8     735   DR-3  1930-01-12    735    NaN   sal     0.06
9     735   DR-3  1930-01-12    735    NaN  temp   -26.00
10    751   DR-3  1930-02-26    751     pb   rad     4.35
11    751   DR-3  1930-02-26    751     pb  temp   -18.50
12    751   DR-3  1930-02-26    751   lake   sal     0.10
13    752   DR-3         NaN    752   lake   rad     2.19
14    752   DR-3         NaN    752   lake   sal     0.09
15    752   DR-3         NaN    752   lake  temp   -16.00
16    752   DR-3         NaN    752    roe   sal    41.60
17    837  MSK-4  1932-01-14    837   lake   rad     1.46
18    837  MSK-4  1932-01-14    837   lake   sal     0.21
19    837  MSK-4  1932-01-14    837    roe   sal    22.50
20    844   DR-1  1932-03-22    844    roe   rad    11.25

In [11]:

num_legs = pd.Series({'goat': 4, 'amoeba': nan})
print(num_legs)

goat      4.0
amoeba    NaN
dtype: float64

In [13]:

scientists = pd.DataFrame(
    {
        "Name": ["Rosaline Franklin", "William Gosset"],
        "Occupation": ["Chemist","Statistician"],
        "Born": ["1990-07-25", "1876-06-13"],
        "Died": ["1958-04-16", "1937-10-16"],
        "missing": [NaN, nan],
    }
)
print(scientists)

                Name    Occupation        Born        Died  missing
0  Rosaline Franklin       Chemist  1990-07-25  1958-04-16      NaN
1     William Gosset  Statistician  1876-06-13  1937-10-16      NaN

In [14]:

print(scientists.dtypes)

Name           object
Occupation     object
Born           object
Died           object
missing       float64
dtype: object

In [15]:

gapminder = pd.read_csv('../data/gapminder.tsv', sep='\t')

life_exp = gapminder.groupby('year')['lifeExp'].mean()
print(life_exp)

year
1952    49.057620
1957    51.507401
1962    53.609249
1967    55.678290
1972    57.647386
1977    59.570157
1982    61.533197
1987    63.212613
1992    64.160338
1997    65.014676
2002    65.694923
2007    67.007423
Name: lifeExp, dtype: float64

In [17]:

y2000 = life_exp[life_exp.index > 2000]
print(y2000)

year
2002    65.694923
2007    67.007423
Name: lifeExp, dtype: float64

In [18]:

print(y2000.reindex(range(2000, 2010)))

year
2000          NaN
2001          NaN
2002    65.694923
2003          NaN
2004          NaN
2005          NaN
2006          NaN
2007    67.007423
2008          NaN
2009          NaN
Name: lifeExp, dtype: float64

In [19]:

ebola = pd.read_csv('../data/country_timeseries.csv')

print(ebola.count())

Date                   122
Day                    122
Cases_Guinea            93
Cases_Liberia           83
Cases_SierraLeone       87
Cases_Nigeria           38
Cases_Senegal           25
Cases_UnitedStates      18
Cases_Spain             16
Cases_Mali              12
Deaths_Guinea           92
Deaths_Liberia          81
Deaths_SierraLeone      87
Deaths_Nigeria          38
Deaths_Senegal          22
Deaths_UnitedStates     18
Deaths_Spain            16
Deaths_Mali             12
dtype: int64

In [20]:

num_rows = ebola.shape[0]
num_missing = num_rows - ebola.count()
print(num_missing)

Date                     0
Day                      0
Cases_Guinea            29
Cases_Liberia           39
Cases_SierraLeone       35
Cases_Nigeria           84
Cases_Senegal           97
Cases_UnitedStates     104
Cases_Spain            106
Cases_Mali             110
Deaths_Guinea           30
Deaths_Liberia          41
Deaths_SierraLeone      35
Deaths_Nigeria          84
Deaths_Senegal         100
Deaths_UnitedStates    104
Deaths_Spain           106
Deaths_Mali            110
dtype: int64

In [21]:

import numpy as np

print(np.count_nonzero(ebola.isnull()))

In [22]:

print(np.count_nonzero(ebola['Cases_Guinea'].isnull()))

In [23]:

cnts = ebola.Cases_Guinea.value_counts(dropna=False)
print(cnts)

Cases_Guinea
NaN       29
86.0       3
495.0      2
112.0      2
390.0      2
          ..
1199.0     1
1298.0     1
1350.0     1
1472.0     1
49.0       1
Name: count, Length: 89, dtype: int64

In [24]:

print(cnts.loc[pd.isnull(cnts.index)])

Cases_Guinea
NaN    29
Name: count, dtype: int64

In [25]:

print(ebola.Cases_Guinea.isnull().sum())

In [26]:

print(ebola.fillna(0).iloc[:, 0:5])

           Date  Day  Cases_Guinea  Cases_Liberia  Cases_SierraLeone
0      1/5/2015  289        2776.0            0.0            10030.0
1      1/4/2015  288        2775.0            0.0             9780.0
2      1/3/2015  287        2769.0         8166.0             9722.0
3      1/2/2015  286           0.0         8157.0                0.0
4    12/31/2014  284        2730.0         8115.0             9633.0
..          ...  ...           ...            ...                ...
117   3/27/2014    5         103.0            8.0                6.0
118   3/26/2014    4          86.0            0.0                0.0
119   3/25/2014    3          86.0            0.0                0.0
120   3/24/2014    2          86.0            0.0                0.0
121   3/22/2014    0          49.0            0.0                0.0

[122 rows x 5 columns]

In [27]:

print(ebola.fillna(method='ffill').iloc[:, 0:5])

           Date  Day  Cases_Guinea  Cases_Liberia  Cases_SierraLeone
0      1/5/2015  289        2776.0            NaN            10030.0
1      1/4/2015  288        2775.0            NaN             9780.0
2      1/3/2015  287        2769.0         8166.0             9722.0
3      1/2/2015  286        2769.0         8157.0             9722.0
4    12/31/2014  284        2730.0         8115.0             9633.0
..          ...  ...           ...            ...                ...
117   3/27/2014    5         103.0            8.0                6.0
118   3/26/2014    4          86.0            8.0                6.0
119   3/25/2014    3          86.0            8.0                6.0
120   3/24/2014    2          86.0            8.0                6.0
121   3/22/2014    0          49.0            8.0                6.0

[122 rows x 5 columns]

In [28]:

print(ebola.fillna(method='bfill').iloc[:, 0:5])

           Date  Day  Cases_Guinea  Cases_Liberia  Cases_SierraLeone
0      1/5/2015  289        2776.0         8166.0            10030.0
1      1/4/2015  288        2775.0         8166.0             9780.0
2      1/3/2015  287        2769.0         8166.0             9722.0
3      1/2/2015  286        2730.0         8157.0             9633.0
4    12/31/2014  284        2730.0         8115.0             9633.0
..          ...  ...           ...            ...                ...
117   3/27/2014    5         103.0            8.0                6.0
118   3/26/2014    4          86.0            NaN                NaN
119   3/25/2014    3          86.0            NaN                NaN
120   3/24/2014    2          86.0            NaN                NaN
121   3/22/2014    0          49.0            NaN                NaN

[122 rows x 5 columns]

In [29]:

print(ebola.interpolate().iloc[:, 0:5])

           Date  Day  Cases_Guinea  Cases_Liberia  Cases_SierraLeone
0      1/5/2015  289        2776.0            NaN            10030.0
1      1/4/2015  288        2775.0            NaN             9780.0
2      1/3/2015  287        2769.0         8166.0             9722.0
3      1/2/2015  286        2749.5         8157.0             9677.5
4    12/31/2014  284        2730.0         8115.0             9633.0
..          ...  ...           ...            ...                ...
117   3/27/2014    5         103.0            8.0                6.0
118   3/26/2014    4          86.0            8.0                6.0
119   3/25/2014    3          86.0            8.0                6.0
120   3/24/2014    2          86.0            8.0                6.0
121   3/22/2014    0          49.0            8.0                6.0

[122 rows x 5 columns]

In [30]:

print(ebola.shape)

(122, 18)

In [34]:

ebola_dropna = ebola.dropna()
print(ebola_dropna.shape)

(1, 18)

In [35]:

ebola["Cases_multiple"] = (
    ebola["Cases_Guinea"]
    + ebola["Cases_Liberia"]
    + ebola["Cases_SierraLeone"]
)

In [36]:

ebola_subset = ebola.loc[:, ["Cases_Guinea", "Cases_Liberia", "Cases_SierraLeone", "Cases_multiple"]]

print(ebola_subset.head(n=10))

   Cases_Guinea  Cases_Liberia  Cases_SierraLeone  Cases_multiple
0        2776.0            NaN            10030.0             NaN
1        2775.0            NaN             9780.0             NaN
2        2769.0         8166.0             9722.0         20657.0
3           NaN         8157.0                NaN             NaN
4        2730.0         8115.0             9633.0         20478.0
5        2706.0         8018.0             9446.0         20170.0
6        2695.0            NaN             9409.0             NaN
7        2630.0         7977.0             9203.0         19810.0
8        2597.0            NaN             9004.0             NaN
9        2571.0         7862.0             8939.0         19372.0

In [37]:

print(ebola.Cases_Guinea.sum(skipna=True))

84729.0

In [38]:

print(ebola.Cases_Guinea.sum(skipna=False))

nan

In [39]:

scientists.loc[1, "Name"] = pd.NA
scientists.loc[1, "Age"] = pd.NA
print(scientists)

                Name    Occupation        Born        Died  missing  Age
0  Rosaline Franklin       Chemist  1990-07-25  1958-04-16      NaN  NaN
1               <NA>  Statistician  1876-06-13  1937-10-16      NaN  NaN

In [40]:

print(scientists.dtypes)

Name           object
Occupation     object
Born           object
Died           object
missing       float64
Age           float64
dtype: object

10장 자료형 더 알아보기¶

In [1]:

import pandas as pd
import seaborn as sns

tips = sns.load_dataset("tips")

In [2]:

print(tips.dtypes)

total_bill     float64
tip            float64
sex           category
smoker        category
day           category
time          category
size             int64
dtype: object

In [3]:

tips['sex_str'] = tips['sex'].astype('str')
print(tips.dtypes)

total_bill     float64
tip            float64
sex           category
smoker        category
day           category
time          category
size             int64
sex_str         object
dtype: object

In [4]:

tips['total_bill'] = tips['total_bill'].astype('str')
print(tips.dtypes)

total_bill      object
tip            float64
sex           category
smoker        category
day           category
time          category
size             int64
sex_str         object
dtype: object

In [5]:

tips['total_bill'] = tips['total_bill'].astype('float')
print(tips.dtypes)

total_bill     float64
tip            float64
sex           category
smoker        category
day           category
time          category
size             int64
sex_str         object
dtype: object

In [6]:

tips_sub_miss = tips.head(10).copy()
tips_sub_miss.loc[[1,3,5,7], 'total_bill'] = 'missing'

print(tips_sub_miss)

  total_bill   tip     sex smoker  day    time  size sex_str
0      16.99  1.01  Female     No  Sun  Dinner     2  Female
1    missing  1.66    Male     No  Sun  Dinner     3    Male
2      21.01  3.50    Male     No  Sun  Dinner     3    Male
3    missing  3.31    Male     No  Sun  Dinner     2    Male
4      24.59  3.61  Female     No  Sun  Dinner     4  Female
5    missing  4.71    Male     No  Sun  Dinner     4    Male
6       8.77  2.00    Male     No  Sun  Dinner     2    Male
7    missing  3.12    Male     No  Sun  Dinner     4    Male
8      15.04  1.96    Male     No  Sun  Dinner     2    Male
9      14.78  3.23    Male     No  Sun  Dinner     2    Male

In [7]:

print(tips_sub_miss.dtypes)

total_bill      object
tip            float64
sex           category
smoker        category
day           category
time          category
size             int64
sex_str         object
dtype: object

In [8]:

pd.to_numeric(tips_sub_miss['total_bill'])

---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
File ~\anaconda3\Lib\site-packages\pandas\_libs\lib.pyx:2280, in pandas._libs.lib.maybe_convert_numeric()

ValueError: Unable to parse string "missing"

During handling of the above exception, another exception occurred:

ValueError                                Traceback (most recent call last)
Cell In[8], line 1
----> 1 pd.to_numeric(tips_sub_miss['total_bill'])

File ~\anaconda3\Lib\site-packages\pandas\core\tools\numeric.py:217, in to_numeric(arg, errors, downcast, dtype_backend)
    215 coerce_numeric = errors not in ("ignore", "raise")
    216 try:
--> 217     values, new_mask = lib.maybe_convert_numeric(  # type: ignore[call-overload]  # noqa
    218         values,
    219         set(),
    220         coerce_numeric=coerce_numeric,
    221         convert_to_masked_nullable=dtype_backend is not lib.no_default
    222         or isinstance(values_dtype, StringDtype),
    223     )
    224 except (ValueError, TypeError):
    225     if errors == "raise":

File ~\anaconda3\Lib\site-packages\pandas\_libs\lib.pyx:2322, in pandas._libs.lib.maybe_convert_numeric()

ValueError: Unable to parse string "missing" at position 1

In [9]:

tips_sub_miss['total_bill'] = pd.to_numeric(tips_sub_miss['total_bill'], errors='ignore')
print(tips_sub_miss)

  total_bill   tip     sex smoker  day    time  size sex_str
0      16.99  1.01  Female     No  Sun  Dinner     2  Female
1    missing  1.66    Male     No  Sun  Dinner     3    Male
2      21.01  3.50    Male     No  Sun  Dinner     3    Male
3    missing  3.31    Male     No  Sun  Dinner     2    Male
4      24.59  3.61  Female     No  Sun  Dinner     4  Female
5    missing  4.71    Male     No  Sun  Dinner     4    Male
6       8.77  2.00    Male     No  Sun  Dinner     2    Male
7    missing  3.12    Male     No  Sun  Dinner     4    Male
8      15.04  1.96    Male     No  Sun  Dinner     2    Male
9      14.78  3.23    Male     No  Sun  Dinner     2    Male

In [10]:

print(tips_sub_miss.dtypes)

total_bill      object
tip            float64
sex           category
smoker        category
day           category
time          category
size             int64
sex_str         object
dtype: object

In [11]:

tips_sub_miss['total_bill'] = pd.to_numeric(tips_sub_miss['total_bill'], errors='coerce')
print(tips_sub_miss)

   total_bill   tip     sex smoker  day    time  size sex_str
0       16.99  1.01  Female     No  Sun  Dinner     2  Female
1         NaN  1.66    Male     No  Sun  Dinner     3    Male
2       21.01  3.50    Male     No  Sun  Dinner     3    Male
3         NaN  3.31    Male     No  Sun  Dinner     2    Male
4       24.59  3.61  Female     No  Sun  Dinner     4  Female
5         NaN  4.71    Male     No  Sun  Dinner     4    Male
6        8.77  2.00    Male     No  Sun  Dinner     2    Male
7         NaN  3.12    Male     No  Sun  Dinner     4    Male
8       15.04  1.96    Male     No  Sun  Dinner     2    Male
9       14.78  3.23    Male     No  Sun  Dinner     2    Male

In [12]:

print(tips_sub_miss.dtypes)

total_bill     float64
tip            float64
sex           category
smoker        category
day           category
time          category
size             int64
sex_str         object
dtype: object

In [13]:

tips['sex'] = tips['sex'].astype('str')
print(tips.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 244 entries, 0 to 243
Data columns (total 8 columns):
 #   Column      Non-Null Count  Dtype   
---  ------      --------------  -----   
 0   total_bill  244 non-null    float64 
 1   tip         244 non-null    float64 
 2   sex         244 non-null    object  
 3   smoker      244 non-null    category
 4   day         244 non-null    category
 5   time        244 non-null    category
 6   size        244 non-null    int64   
 7   sex_str     244 non-null    object  
dtypes: category(3), float64(2), int64(1), object(2)
memory usage: 10.8+ KB
None

In [14]:

tips['sex'] = tips['sex'].astype('category')
print(tips.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 244 entries, 0 to 243
Data columns (total 8 columns):
 #   Column      Non-Null Count  Dtype   
---  ------      --------------  -----   
 0   total_bill  244 non-null    float64 
 1   tip         244 non-null    float64 
 2   sex         244 non-null    category
 3   smoker      244 non-null    category
 4   day         244 non-null    category
 5   time        244 non-null    category
 6   size        244 non-null    int64   
 7   sex_str     244 non-null    object  
dtypes: category(4), float64(2), int64(1), object(1)
memory usage: 9.3+ KB
None

In [ ]:

[01] 쉽게 배우는 JSP 웹 프로그래밍 (1)	2024.01.13
[09][完] Do it! 데이터 분석을 위한 판다스 입문 (0)	2024.01.09
[07] Do it! 데이터 분석을 위한 판다스 입문 (0)	2024.01.08
[06] Do it! 데이터 분석을 위한 판다스 입문 (0)	2024.01.07
[05] Do it! 데이터 분석을 위한 판다스 입문 (0)	2024.01.06

늦었다고 안 하면 더 늦어요

[08] Do it! 데이터 분석을 위한 판다스 입문

8일차

09 결측값 알아보기¶

10장 자료형 더 알아보기¶

'도서 > 프로그래밍' 카테고리의 다른 글

티스토리툴바

[08] Do it! 데이터 분석을 위한 판다스 입문

8일차

09 결측값 알아보기¶

10장 자료형 더 알아보기¶

'도서 > 프로그래밍' 카테고리의 다른 글

관련글

티스토리툴바