
2일차
03 판다스 자료구조 살펴보기
In [1]:
import pandas as pd
s = pd.Series(['banana', 41])
s
Out[1]:
0 banana 1 41 dtype: object
In [3]:
s2 = pd.Series(data=['Wes McKinney', 'Creator of Pandas'], index=['Person', 'Who'])
s2
Out[3]:
Person Wes McKinney Who Creator of Pandas dtype: object
In [4]:
scientists = pd.DataFrame({
'Name': ['<NAME>', '<NAME>', '<NAME>'],
'Age': [25, 25, 25],
'Gender': ['M', 'F', 'M']
})
scientists
Out[4]:
Name | Age | Gender | |
---|---|---|---|
0 | <NAME> | 25 | M |
1 | <NAME> | 25 | F |
2 | <NAME> | 25 | M |
In [6]:
scientists = pd.DataFrame(data={
'Name': ['<NAME>', '<NAME>', '<NAME>'],
'Age': [25, 25, 25],
'Gender': ['M', 'F', 'M']
}, index=['<NAME1>', '<NAME2>', '<NAME3>'], columns=['Name', 'Age', 'Gender'])
print(scientists)
Name Age Gender <NAME1> <NAME> 25 M <NAME2> <NAME> 25 F <NAME3> <NAME> 25 M
In [7]:
first_row = scientists.loc['<NAME1>']
print(type(first_row))
<class 'pandas.core.series.Series'>
In [8]:
first_row
Out[8]:
Name <NAME> Age 25 Gender M Name: <NAME1>, dtype: object
In [9]:
first_row.index
Out[9]:
Index(['Name', 'Age', 'Gender'], dtype='object')
In [10]:
first_row.values
Out[10]:
array(['<NAME>', 25, 'M'], dtype=object)
In [11]:
print(first_row.T)
print(first_row.shape)
print(first_row.size)
Name <NAME> Age 25 Gender M Name: <NAME1>, dtype: object (3,) 3
In [12]:
first_row.keys()
Out[12]:
Index(['Name', 'Age', 'Gender'], dtype='object')
In [13]:
first_row.index[0]
Out[13]:
'Name'
In [15]:
ages = scientists['Age']
ages
Out[15]:
<NAME1> 25 <NAME2> 25 <NAME3> 25 Name: Age, dtype: int64
In [16]:
ages.mean()
Out[16]:
25.0
In [17]:
ages.min()
Out[17]:
25
In [18]:
ages.max()
Out[18]:
25
In [19]:
ages.std()
Out[19]:
0.0
In [21]:
scientists = pd.read_csv('../data/scientists.csv')
print(scientists)
Name Born Died Age Occupation 0 Rosaline Franklin 1920-07-25 1958-04-16 37 Chemist 1 William Gosset 1876-06-13 1937-10-16 61 Statistician 2 Florence Nightingale 1820-05-12 1910-08-13 90 Nurse 3 Marie Curie 1867-11-07 1934-07-04 66 Chemist 4 Rachel Carson 1907-05-27 1964-04-14 56 Biologist 5 John Snow 1813-03-15 1858-06-16 45 Physician 6 Alan Turing 1912-06-23 1954-06-07 41 Computer Scientist 7 Johann Gauss 1777-04-30 1855-02-23 77 Mathematician
In [23]:
scientists.shape
Out[23]:
(8, 5)
In [24]:
ages = scientists['Age']
print(ages)
0 37 1 61 2 90 3 66 4 56 5 45 6 41 7 77 Name: Age, dtype: int64
In [25]:
print(ages.describe())
count 8.000000 mean 59.125000 std 18.325918 min 37.000000 25% 44.000000 50% 58.500000 75% 68.750000 max 90.000000 Name: Age, dtype: float64
In [26]:
print(ages[ages > ages.mean()])
1 61 2 90 3 66 7 77 Name: Age, dtype: int64
In [27]:
print(ages > ages.mean())
0 False 1 True 2 True 3 True 4 False 5 False 6 False 7 True Name: Age, dtype: bool
In [28]:
print(type(ages > ages.mean()))
<class 'pandas.core.series.Series'>
In [29]:
print(ages + ages)
0 74 1 122 2 180 3 132 4 112 5 90 6 82 7 154 Name: Age, dtype: int64
In [30]:
print(ages * ages)
0 1369 1 3721 2 8100 3 4356 4 3136 5 2025 6 1681 7 5929 Name: Age, dtype: int64
In [33]:
print(ages + 100)
0 137 1 161 2 190 3 166 4 156 5 145 6 141 7 177 Name: Age, dtype: int64
길이가 서로 다른 벡터 연산하기¶
길이가 서로 다른 벡터를 연산할 때는 벡터의 type()에 따라 결과가 달라집니다. 길이가 서로 다른 벡터를 연산할 때는 인덱스가 같은 요소끼리 연산을 수행합니다. 결과 벡터에서 나머지는 결측값으로 채우고 숫자가 아님을 나타내는 NaN(Not a Number)으로 표시합니다.
In [34]:
print(ages + pd.Series([1, 100]))
0 38.0 1 161.0 2 NaN 3 NaN 4 NaN 5 NaN 6 NaN 7 NaN dtype: float64
In [35]:
import numpy as np
print(ages + np.array([1, 100]))
--------------------------------------------------------------------------- ValueError Traceback (most recent call last) Cell In[35], line 3 1 import numpy as np ----> 3 print(ages + np.array([1, 100])) File ~\anaconda3\Lib\site-packages\pandas\core\ops\common.py:81, in _unpack_zerodim_and_defer.<locals>.new_method(self, other) 77 return NotImplemented 79 other = item_from_zerodim(other) ---> 81 return method(self, other) File ~\anaconda3\Lib\site-packages\pandas\core\arraylike.py:186, in OpsMixin.__add__(self, other) 98 @unpack_zerodim_and_defer("__add__") 99 def __add__(self, other): 100 """ 101 Get Addition of DataFrame and other, column-wise. 102 (...) 184 moose 3.0 NaN 185 """ --> 186 return self._arith_method(other, operator.add) File ~\anaconda3\Lib\site-packages\pandas\core\series.py:6112, in Series._arith_method(self, other, op) 6110 def _arith_method(self, other, op): 6111 self, other = ops.align_method_SERIES(self, other) -> 6112 return base.IndexOpsMixin._arith_method(self, other, op) File ~\anaconda3\Lib\site-packages\pandas\core\base.py:1348, in IndexOpsMixin._arith_method(self, other, op) 1345 rvalues = ensure_wrapped_if_datetimelike(rvalues) 1347 with np.errstate(all="ignore"): -> 1348 result = ops.arithmetic_op(lvalues, rvalues, op) 1350 return self._construct_result(result, name=res_name) File ~\anaconda3\Lib\site-packages\pandas\core\ops\array_ops.py:232, in arithmetic_op(left, right, op) 228 _bool_arith_check(op, left, right) 230 # error: Argument 1 to "_na_arithmetic_op" has incompatible type 231 # "Union[ExtensionArray, ndarray[Any, Any]]"; expected "ndarray[Any, Any]" --> 232 res_values = _na_arithmetic_op(left, right, op) # type: ignore[arg-type] 234 return res_values File ~\anaconda3\Lib\site-packages\pandas\core\ops\array_ops.py:171, in _na_arithmetic_op(left, right, op, is_cmp) 168 func = partial(expressions.evaluate, op) 170 try: --> 171 result = func(left, right) 172 except TypeError: 173 if not is_cmp and (is_object_dtype(left.dtype) or is_object_dtype(right)): 174 # For object dtype, fallback to a masked operation (only operating 175 # on the non-missing values) 176 # Don't do this for comparisons, as that will handle complex numbers 177 # incorrectly, see GH#32047 File ~\anaconda3\Lib\site-packages\pandas\core\computation\expressions.py:239, in evaluate(op, a, b, use_numexpr) 236 if op_str is not None: 237 if use_numexpr: 238 # error: "None" not callable --> 239 return _evaluate(op, op_str, a, b) # type: ignore[misc] 240 return _evaluate_standard(op, op_str, a, b) File ~\anaconda3\Lib\site-packages\pandas\core\computation\expressions.py:128, in _evaluate_numexpr(op, op_str, a, b) 125 _store_test_result(result is not None) 127 if result is None: --> 128 result = _evaluate_standard(op, op_str, a, b) 130 return result File ~\anaconda3\Lib\site-packages\pandas\core\computation\expressions.py:70, in _evaluate_standard(op, op_str, a, b) 68 if _TEST_MODE: 69 _store_test_result(False) ---> 70 return op(a, b) ValueError: operands could not be broadcast together with shapes (8,) (2,)
In [37]:
rev_ages = ages.sort_index(ascending=False)
print(rev_ages)
7 77 6 41 5 45 4 56 3 66 2 90 1 61 0 37 Name: Age, dtype: int64
In [38]:
print(ages * 2)
0 74 1 122 2 180 3 132 4 112 5 90 6 82 7 154 Name: Age, dtype: int64
In [39]:
print(ages + rev_ages)
0 74 1 122 2 180 3 132 4 112 5 90 6 82 7 154 Name: Age, dtype: int64
정렬을 해서 순서를 바꿔도 더할 때 인덱스로 더하기 때문에 결과가 * 2 한 것과 동일하게 나옴
03-3 데이터프레임 다루기¶
In [40]:
scientists.index
Out[40]:
RangeIndex(start=0, stop=8, step=1)
In [41]:
scientists.columns
Out[41]:
Index(['Name', 'Born', 'Died', 'Age', 'Occupation'], dtype='object')
In [42]:
scientists.values
Out[42]:
array([['Rosaline Franklin', '1920-07-25', '1958-04-16', 37, 'Chemist'], ['William Gosset', '1876-06-13', '1937-10-16', 61, 'Statistician'], ['Florence Nightingale', '1820-05-12', '1910-08-13', 90, 'Nurse'], ['Marie Curie', '1867-11-07', '1934-07-04', 66, 'Chemist'], ['Rachel Carson', '1907-05-27', '1964-04-14', 56, 'Biologist'], ['John Snow', '1813-03-15', '1858-06-16', 45, 'Physician'], ['Alan Turing', '1912-06-23', '1954-06-07', 41, 'Computer Scientist'], ['Johann Gauss', '1777-04-30', '1855-02-23', 77, 'Mathematician']], dtype=object)
In [43]:
scientists.loc[scientists['Age'] > scientists['Age'].mean()]
Out[43]:
Name | Born | Died | Age | Occupation | |
---|---|---|---|---|---|
1 | William Gosset | 1876-06-13 | 1937-10-16 | 61 | Statistician |
2 | Florence Nightingale | 1820-05-12 | 1910-08-13 | 90 | Nurse |
3 | Marie Curie | 1867-11-07 | 1934-07-04 | 66 | Chemist |
7 | Johann Gauss | 1777-04-30 | 1855-02-23 | 77 | Mathematician |
In [44]:
first_half = scientists[:4]
second_half = scientists[4:]
print(first_half)
Name Born Died Age Occupation 0 Rosaline Franklin 1920-07-25 1958-04-16 37 Chemist 1 William Gosset 1876-06-13 1937-10-16 61 Statistician 2 Florence Nightingale 1820-05-12 1910-08-13 90 Nurse 3 Marie Curie 1867-11-07 1934-07-04 66 Chemist
In [45]:
print(second_half)
Name Born Died Age Occupation 4 Rachel Carson 1907-05-27 1964-04-14 56 Biologist 5 John Snow 1813-03-15 1858-06-16 45 Physician 6 Alan Turing 1912-06-23 1954-06-07 41 Computer Scientist 7 Johann Gauss 1777-04-30 1855-02-23 77 Mathematician
In [46]:
scientists * 2
Out[46]:
Name | Born | Died | Age | Occupation | |
---|---|---|---|---|---|
0 | Rosaline FranklinRosaline Franklin | 1920-07-251920-07-25 | 1958-04-161958-04-16 | 74 | ChemistChemist |
1 | William GossetWilliam Gosset | 1876-06-131876-06-13 | 1937-10-161937-10-16 | 122 | StatisticianStatistician |
2 | Florence NightingaleFlorence Nightingale | 1820-05-121820-05-12 | 1910-08-131910-08-13 | 180 | NurseNurse |
3 | Marie CurieMarie Curie | 1867-11-071867-11-07 | 1934-07-041934-07-04 | 132 | ChemistChemist |
4 | Rachel CarsonRachel Carson | 1907-05-271907-05-27 | 1964-04-141964-04-14 | 112 | BiologistBiologist |
5 | John SnowJohn Snow | 1813-03-151813-03-15 | 1858-06-161858-06-16 | 90 | PhysicianPhysician |
6 | Alan TuringAlan Turing | 1912-06-231912-06-23 | 1954-06-071954-06-07 | 82 | Computer ScientistComputer Scientist |
7 | Johann GaussJohann Gauss | 1777-04-301777-04-30 | 1855-02-231855-02-23 | 154 | MathematicianMathematician |
In [47]:
df1 = df2 = pd.DataFrame(data=[[1, 2, 3], [4, 5, 6], [7, 8, 9]])
df1_added = df1.add(df2)
print(df1_added)
0 1 2 0 2 4 6 1 8 10 12 2 14 16 18
In [48]:
scientists.dtypes
Out[48]:
Name object Born object Died object Age int64 Occupation object dtype: object
In [49]:
born_datetime = pd.to_datetime(scientists['Born'], format='%Y-%m-%d')
born_datetime
Out[49]:
0 1920-07-25 1 1876-06-13 2 1820-05-12 3 1867-11-07 4 1907-05-27 5 1813-03-15 6 1912-06-23 7 1777-04-30 Name: Born, dtype: datetime64[ns]
In [51]:
died_datetime = pd.to_datetime(scientists['Died'], format='%Y-%m-%d')
died_datetime
Out[51]:
0 1958-04-16 1 1937-10-16 2 1910-08-13 3 1934-07-04 4 1964-04-14 5 1858-06-16 6 1954-06-07 7 1855-02-23 Name: Died, dtype: datetime64[ns]
In [52]:
scientists['born_dt'], scientists['died_dt'] = (born_datetime, died_datetime)
In [53]:
print(scientists.head())
Name Born Died Age Occupation born_dt \ 0 Rosaline Franklin 1920-07-25 1958-04-16 37 Chemist 1920-07-25 1 William Gosset 1876-06-13 1937-10-16 61 Statistician 1876-06-13 2 Florence Nightingale 1820-05-12 1910-08-13 90 Nurse 1820-05-12 3 Marie Curie 1867-11-07 1934-07-04 66 Chemist 1867-11-07 4 Rachel Carson 1907-05-27 1964-04-14 56 Biologist 1907-05-27 died_dt 0 1958-04-16 1 1937-10-16 2 1910-08-13 3 1934-07-04 4 1964-04-14
In [54]:
scientists.shape
Out[54]:
(8, 7)
In [55]:
scientists.dtypes
Out[55]:
Name object Born object Died object Age int64 Occupation object born_dt datetime64[ns] died_dt datetime64[ns] dtype: object
In [56]:
scientists['Age'].sample(frac=1, random_state=42)
Out[56]:
1 61 5 45 0 37 7 77 2 90 4 56 3 66 6 41 Name: Age, dtype: int64
In [57]:
scientists['Age'] = scientists['Age'].sample(frac=1, random_state=42).values
print(scientists['Age'])
0 61 1 45 2 37 3 77 4 90 5 56 6 66 7 41 Name: Age, dtype: int64
In [58]:
scientists['age_days'] = (scientists['died_dt'] - scientists['born_dt'])
print(scientists)
Name Born Died Age Occupation \ 0 Rosaline Franklin 1920-07-25 1958-04-16 61 Chemist 1 William Gosset 1876-06-13 1937-10-16 45 Statistician 2 Florence Nightingale 1820-05-12 1910-08-13 37 Nurse 3 Marie Curie 1867-11-07 1934-07-04 77 Chemist 4 Rachel Carson 1907-05-27 1964-04-14 90 Biologist 5 John Snow 1813-03-15 1858-06-16 56 Physician 6 Alan Turing 1912-06-23 1954-06-07 66 Computer Scientist 7 Johann Gauss 1777-04-30 1855-02-23 41 Mathematician born_dt died_dt age_days 0 1920-07-25 1958-04-16 13779 days 1 1876-06-13 1937-10-16 22404 days 2 1820-05-12 1910-08-13 32964 days 3 1867-11-07 1934-07-04 24345 days 4 1907-05-27 1964-04-14 20777 days 5 1813-03-15 1858-06-16 16529 days 6 1912-06-23 1954-06-07 15324 days 7 1777-04-30 1855-02-23 28422 days
In [59]:
scientists['age_years'] = (scientists['age_days'].dt.days / 365).apply(np.floor)
scientists
Out[59]:
Name | Born | Died | Age | Occupation | born_dt | died_dt | age_days | age_years | |
---|---|---|---|---|---|---|---|---|---|
0 | Rosaline Franklin | 1920-07-25 | 1958-04-16 | 61 | Chemist | 1920-07-25 | 1958-04-16 | 13779 days | 37.0 |
1 | William Gosset | 1876-06-13 | 1937-10-16 | 45 | Statistician | 1876-06-13 | 1937-10-16 | 22404 days | 61.0 |
2 | Florence Nightingale | 1820-05-12 | 1910-08-13 | 37 | Nurse | 1820-05-12 | 1910-08-13 | 32964 days | 90.0 |
3 | Marie Curie | 1867-11-07 | 1934-07-04 | 77 | Chemist | 1867-11-07 | 1934-07-04 | 24345 days | 66.0 |
4 | Rachel Carson | 1907-05-27 | 1964-04-14 | 90 | Biologist | 1907-05-27 | 1964-04-14 | 20777 days | 56.0 |
5 | John Snow | 1813-03-15 | 1858-06-16 | 56 | Physician | 1813-03-15 | 1858-06-16 | 16529 days | 45.0 |
6 | Alan Turing | 1912-06-23 | 1954-06-07 | 66 | Computer Scientist | 1912-06-23 | 1954-06-07 | 15324 days | 41.0 |
7 | Johann Gauss | 1777-04-30 | 1855-02-23 | 41 | Mathematician | 1777-04-30 | 1855-02-23 | 28422 days | 77.0 |
매개변수 inplace를 사용할 때는 조심하세요! inplace를 True 로 지정하면 수정된 데이터프레임이 아닌 None을 반환하고 기존 데이터프레임을 바로 수정합니다. 바로 수정하면 데이터를 원치 않게 덮어쓸 수 있으므로 설정하지 않는 것이 좋습니다.
In [61]:
scientists = scientists.assign(age_days_assign=scientists['died_dt'] - scientists['born_dt'],
age_year_assign=(scientists['age_days'].dt.days / 365).apply(np.floor))
print(scientists)
Name Born Died Age Occupation \ 0 Rosaline Franklin 1920-07-25 1958-04-16 61 Chemist 1 William Gosset 1876-06-13 1937-10-16 45 Statistician 2 Florence Nightingale 1820-05-12 1910-08-13 37 Nurse 3 Marie Curie 1867-11-07 1934-07-04 77 Chemist 4 Rachel Carson 1907-05-27 1964-04-14 90 Biologist 5 John Snow 1813-03-15 1858-06-16 56 Physician 6 Alan Turing 1912-06-23 1954-06-07 66 Computer Scientist 7 Johann Gauss 1777-04-30 1855-02-23 41 Mathematician born_dt died_dt age_days age_years age_days_assign age_year_assign 0 1920-07-25 1958-04-16 13779 days 37.0 13779 days 37.0 1 1876-06-13 1937-10-16 22404 days 61.0 22404 days 61.0 2 1820-05-12 1910-08-13 32964 days 90.0 32964 days 90.0 3 1867-11-07 1934-07-04 24345 days 66.0 24345 days 66.0 4 1907-05-27 1964-04-14 20777 days 56.0 20777 days 56.0 5 1813-03-15 1858-06-16 16529 days 45.0 16529 days 45.0 6 1912-06-23 1954-06-07 15324 days 41.0 15324 days 41.0 7 1777-04-30 1855-02-23 28422 days 77.0 28422 days 77.0
In [64]:
scientists = scientists.assign(
age_days_assign=scientists['died_dt'] - scientists['born_dt'],
age_year_assign=lambda df_: (df_["age_days_assign"].dt.days / 365).apply(np.floor),
)
scientists
Out[64]:
Name | Born | Died | Age | Occupation | born_dt | died_dt | age_days | age_years | age_days_assign | age_year_assign | |
---|---|---|---|---|---|---|---|---|---|---|---|
0 | Rosaline Franklin | 1920-07-25 | 1958-04-16 | 61 | Chemist | 1920-07-25 | 1958-04-16 | 13779 days | 37.0 | 13779 days | 37.0 |
1 | William Gosset | 1876-06-13 | 1937-10-16 | 45 | Statistician | 1876-06-13 | 1937-10-16 | 22404 days | 61.0 | 22404 days | 61.0 |
2 | Florence Nightingale | 1820-05-12 | 1910-08-13 | 37 | Nurse | 1820-05-12 | 1910-08-13 | 32964 days | 90.0 | 32964 days | 90.0 |
3 | Marie Curie | 1867-11-07 | 1934-07-04 | 77 | Chemist | 1867-11-07 | 1934-07-04 | 24345 days | 66.0 | 24345 days | 66.0 |
4 | Rachel Carson | 1907-05-27 | 1964-04-14 | 90 | Biologist | 1907-05-27 | 1964-04-14 | 20777 days | 56.0 | 20777 days | 56.0 |
5 | John Snow | 1813-03-15 | 1858-06-16 | 56 | Physician | 1813-03-15 | 1858-06-16 | 16529 days | 45.0 | 16529 days | 45.0 |
6 | Alan Turing | 1912-06-23 | 1954-06-07 | 66 | Computer Scientist | 1912-06-23 | 1954-06-07 | 15324 days | 41.0 | 15324 days | 41.0 |
7 | Johann Gauss | 1777-04-30 | 1855-02-23 | 41 | Mathematician | 1777-04-30 | 1855-02-23 | 28422 days | 77.0 | 28422 days | 77.0 |
In [65]:
print(scientists.columns)
Index(['Name', 'Born', 'Died', 'Age', 'Occupation', 'born_dt', 'died_dt', 'age_days', 'age_years', 'age_days_assign', 'age_year_assign'], dtype='object')
In [66]:
scientists_dropped = scientists.drop(['Age'], axis="columns")
scientists_dropped.columns
Out[66]:
Index(['Name', 'Born', 'Died', 'Occupation', 'born_dt', 'died_dt', 'age_days', 'age_years', 'age_days_assign', 'age_year_assign'], dtype='object')
In [67]:
names = scientists['Name']
names
Out[67]:
0 Rosaline Franklin 1 William Gosset 2 Florence Nightingale 3 Marie Curie 4 Rachel Carson 5 John Snow 6 Alan Turing 7 Johann Gauss Name: Name, dtype: object
In [68]:
names.to_pickle('../output/scientists_names_series.pickle')
In [70]:
scientists.to_pickle('../output/scientists_df.pickle')
In [71]:
series_pickle = pd.read_pickle('../output/scientists_names_series.pickle')
series_pickle
Out[71]:
0 Rosaline Franklin 1 William Gosset 2 Florence Nightingale 3 Marie Curie 4 Rachel Carson 5 John Snow 6 Alan Turing 7 Johann Gauss Name: Name, dtype: object
CSV 파일은 데이터를 쉼표로 구분한 파일, 가장 유연한 데이터 저장 형식, 탭으로 구분하면 TSV, 세미콜론을 사용하기도 함 텍스트 에디터로 열 수 있을만큼 범용성이 뛰어나지만 속도가 느리고 디스크 공간을 많이 차지
In [72]:
scientists.to_csv('../output/scientists_df_no_index.csv', index=False)
In [73]:
names_df = names.to_frame()
In [75]:
names_df.to_excel('../output/scientists_names_series_df.xlsx', sheet_name="scientists", engine='openpyxl')
In [76]:
scientists.to_feather('../output/scientists.feather')
In [77]:
sci_feather = pd.read_feather('../output/scientists.feather')
print(sci_feather)
Name Born Died Age Occupation \ 0 Rosaline Franklin 1920-07-25 1958-04-16 61 Chemist 1 William Gosset 1876-06-13 1937-10-16 45 Statistician 2 Florence Nightingale 1820-05-12 1910-08-13 37 Nurse 3 Marie Curie 1867-11-07 1934-07-04 77 Chemist 4 Rachel Carson 1907-05-27 1964-04-14 90 Biologist 5 John Snow 1813-03-15 1858-06-16 56 Physician 6 Alan Turing 1912-06-23 1954-06-07 66 Computer Scientist 7 Johann Gauss 1777-04-30 1855-02-23 41 Mathematician born_dt died_dt age_days age_years age_days_assign age_year_assign 0 1920-07-25 1958-04-16 13779 days 37.0 13779 days 37.0 1 1876-06-13 1937-10-16 22404 days 61.0 22404 days 61.0 2 1820-05-12 1910-08-13 32964 days 90.0 32964 days 90.0 3 1867-11-07 1934-07-04 24345 days 66.0 24345 days 66.0 4 1907-05-27 1964-04-14 20777 days 56.0 20777 days 56.0 5 1813-03-15 1858-06-16 16529 days 45.0 16529 days 45.0 6 1912-06-23 1954-06-07 15324 days 41.0 15324 days 41.0 7 1777-04-30 1855-02-23 28422 days 77.0 28422 days 77.0
Arrow 객체란?
feather 파일은 Apache Arrow 프로젝트의 일부, 데이터 프레임 객체의 유형을 변환할 필요 없이 여러 프로그래밍 언어에서 작동하는 데이터프레임 객체의 메모리 저장 형식을 개발하는 것
In [80]:
sci_sub = scientists.head(2)
sci_dict = sci_sub.to_dict()
import pprint
pprint.pprint(sci_dict)
{'Age': {0: 61, 1: 45}, 'Born': {0: '1920-07-25', 1: '1876-06-13'}, 'Died': {0: '1958-04-16', 1: '1937-10-16'}, 'Name': {0: 'Rosaline Franklin', 1: 'William Gosset'}, 'Occupation': {0: 'Chemist', 1: 'Statistician'}, 'age_days': {0: Timedelta('13779 days 00:00:00'), 1: Timedelta('22404 days 00:00:00')}, 'age_days_assign': {0: Timedelta('13779 days 00:00:00'), 1: Timedelta('22404 days 00:00:00')}, 'age_year_assign': {0: 37.0, 1: 61.0}, 'age_years': {0: 37.0, 1: 61.0}, 'born_dt': {0: Timestamp('1920-07-25 00:00:00'), 1: Timestamp('1876-06-13 00:00:00')}, 'died_dt': {0: Timestamp('1958-04-16 00:00:00'), 1: Timestamp('1937-10-16 00:00:00')}}
In [81]:
sci_dict_df = pd.DataFrame.from_dict(sci_dict)
print(sci_dict_df)
Name Born Died Age Occupation born_dt \ 0 Rosaline Franklin 1920-07-25 1958-04-16 61 Chemist 1920-07-25 1 William Gosset 1876-06-13 1937-10-16 45 Statistician 1876-06-13 died_dt age_days age_years age_days_assign age_year_assign 0 1958-04-16 13779 days 37.0 13779 days 37.0 1 1937-10-16 22404 days 61.0 22404 days 61.0
날짜와 시간을 처리할 때는 조심하세요!
날짜를 처리해야 한다면 object와 같은 일반적인 형식으로 변환하고 그 값을 다시 날짜로 변환해야 합니다.
In [82]:
sci_json = sci_sub.to_json(orient='records', indent=2, date_format="iso")
pprint.pprint(sci_json)
('[\n' ' {\n' ' "Name":"Rosaline Franklin",\n' ' "Born":"1920-07-25",\n' ' "Died":"1958-04-16",\n' ' "Age":61,\n' ' "Occupation":"Chemist",\n' ' "born_dt":"1920-07-25T00:00:00.000",\n' ' "died_dt":"1958-04-16T00:00:00.000",\n' ' "age_days":"P13779DT0H0M0S",\n' ' "age_years":37.0,\n' ' "age_days_assign":"P13779DT0H0M0S",\n' ' "age_year_assign":37.0\n' ' },\n' ' {\n' ' "Name":"William Gosset",\n' ' "Born":"1876-06-13",\n' ' "Died":"1937-10-16",\n' ' "Age":45,\n' ' "Occupation":"Statistician",\n' ' "born_dt":"1876-06-13T00:00:00.000",\n' ' "died_dt":"1937-10-16T00:00:00.000",\n' ' "age_days":"P22404DT0H0M0S",\n' ' "age_years":61.0,\n' ' "age_days_assign":"P22404DT0H0M0S",\n' ' "age_year_assign":61.0\n' ' }\n' ']')
In [83]:
sci_json_df = pd.read_json(sci_json)
sci_json_df
Out[83]:
Name | Born | Died | Age | Occupation | born_dt | died_dt | age_days | age_years | age_days_assign | age_year_assign | |
---|---|---|---|---|---|---|---|---|---|---|---|
0 | Rosaline Franklin | 1920-07-25 | 1958-04-16 | 61 | Chemist | 1920-07-25T00:00:00.000 | 1958-04-16T00:00:00.000 | P13779DT0H0M0S | 37 | P13779DT0H0M0S | 37 |
1 | William Gosset | 1876-06-13 | 1937-10-16 | 45 | Statistician | 1876-06-13T00:00:00.000 | 1937-10-16T00:00:00.000 | P22404DT0H0M0S | 61 | P22404DT0H0M0S | 61 |
In [84]:
sci_json_df.dtypes
Out[84]:
Name object Born object Died object Age int64 Occupation object born_dt object died_dt object age_days object age_years int64 age_days_assign object age_year_assign int64 dtype: object
In [ ]:
In [87]:
sci_json_df['died_dt_json'] = pd.to_datetime(sci_json_df['died_dt'])
sci_json_df
Out[87]:
Name | Born | Died | Age | Occupation | born_dt | died_dt | age_days | age_years | age_days_assign | age_year_assign | died_dt_json | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | Rosaline Franklin | 1920-07-25 | 1958-04-16 | 61 | Chemist | 1920-07-25T00:00:00.000 | 1958-04-16T00:00:00.000 | P13779DT0H0M0S | 37 | P13779DT0H0M0S | 37 | 1958-04-16 |
1 | William Gosset | 1876-06-13 | 1937-10-16 | 45 | Statistician | 1876-06-13T00:00:00.000 | 1937-10-16T00:00:00.000 | P22404DT0H0M0S | 61 | P22404DT0H0M0S | 61 | 1937-10-16 |
In [91]:
sci_json_df['died_dt_json'].dtypes
Out[91]:
dtype('<M8[ns]')
'도서 > 프로그래밍' 카테고리의 다른 글
[04] Do it! 데이터 분석을 위한 판다스 입문 (0) | 2024.01.05 |
---|---|
[03] Do it! 데이터 분석을 위한 판다스 입문 (0) | 2024.01.04 |
[01] Do it! 데이터 분석을 위한 판다스 입문 (2) | 2024.01.02 |
[07][完] 객체지향의 사실과 오해 - 함께 모으기 (2) | 2024.01.01 |
[06] 객체지향의 사실과 오해 - 객체지도 (2) | 2023.12.31 |