07 데이터 결합하고 분해하기¶

여러 데이터셋을 병합하여 하나의 표로 구성하고 분석하기도 합니다. 이렇게 하려면 한 곳에 변경이 생기더라도 모든 데이터에 올바르게 반영될 수 있도록 정규화하는 것이 좋습니다.

In [1]:

import pandas as pd

07-1 데이터 묶어 분석하기¶

In [2]:

df1 = pd.read_csv('../data/concat_1.csv')
df2 = pd.read_csv('../data/concat_2.csv')
df3 = pd.read_csv('../data/concat_3.csv')
print(df1)

    A   B   C   D
0  a0  b0  c0  d0
1  a1  b1  c1  d1
2  a2  b2  c2  d2
3  a3  b3  c3  d3

In [3]:

print(df2)

    A   B   C   D
0  a4  b4  c4  d4
1  a5  b5  c5  d5
2  a6  b6  c6  d6
3  a7  b7  c7  d7

In [4]:

print(df3)

     A    B    C    D
0   a8   b8   c8   d8
1   a9   b9   c9   d9
2  a10  b10  c10  d10
3  a11  b11  c11  d11

In [5]:

print(df1.index)

RangeIndex(start=0, stop=4, step=1)

In [6]:

print(df1.columns)

Index(['A', 'B', 'C', 'D'], dtype='object')

In [7]:

print(df1.values)

[['a0' 'b0' 'c0' 'd0']
 ['a1' 'b1' 'c1' 'd1']
 ['a2' 'b2' 'c2' 'd2']
 ['a3' 'b3' 'c3' 'd3']]

In [9]:

row_concat = pd.concat([df1, df2, df3])
print(row_concat)

     A    B    C    D
0   a0   b0   c0   d0
1   a1   b1   c1   d1
2   a2   b2   c2   d2
3   a3   b3   c3   d3
0   a4   b4   c4   d4
1   a5   b5   c5   d5
2   a6   b6   c6   d6
3   a7   b7   c7   d7
0   a8   b8   c8   d8
1   a9   b9   c9   d9
2  a10  b10  c10  d10
3  a11  b11  c11  d11

In [10]:

print(row_concat.iloc[3, :])

A    a3
B    b3
C    c3
D    d3
Name: 3, dtype: object

In [11]:

new_row_series = pd.Series(['n1', 'n2', 'n3', 'n4'])
print(new_row_series)

0    n1
1    n2
2    n3
3    n4
dtype: object

In [13]:

print(pd.concat([df1, new_row_series]))

     A    B    C    D    0
0   a0   b0   c0   d0  NaN
1   a1   b1   c1   d1  NaN
2   a2   b2   c2   d2  NaN
3   a3   b3   c3   d3  NaN
0  NaN  NaN  NaN  NaN   n1
1  NaN  NaN  NaN  NaN   n2
2  NaN  NaN  NaN  NaN   n3
3  NaN  NaN  NaN  NaN   n4

In [14]:

new_row_df = pd.DataFrame(
    data =[["n1", "n2", "n3", "n4"]],
    columns=["A", "B", "C", "D"]
)
print(new_row_df)

    A   B   C   D
0  n1  n2  n3  n4

In [15]:

row_concat_i = pd.concat([df1, df2, df3], ignore_index=True)
print(row_concat_i)

      A    B    C    D
0    a0   b0   c0   d0
1    a1   b1   c1   d1
2    a2   b2   c2   d2
3    a3   b3   c3   d3
4    a4   b4   c4   d4
5    a5   b5   c5   d5
6    a6   b6   c6   d6
7    a7   b7   c7   d7
8    a8   b8   c8   d8
9    a9   b9   c9   d9
10  a10  b10  c10  d10
11  a11  b11  c11  d11

In [16]:

col_concat = pd.concat([df1, df2, df3], axis="columns")
print(col_concat)

    A   B   C   D   A   B   C   D    A    B    C    D
0  a0  b0  c0  d0  a4  b4  c4  d4   a8   b8   c8   d8
1  a1  b1  c1  d1  a5  b5  c5  d5   a9   b9   c9   d9
2  a2  b2  c2  d2  a6  b6  c6  d6  a10  b10  c10  d10
3  a3  b3  c3  d3  a7  b7  c7  d7  a11  b11  c11  d11

In [17]:

print(col_concat['A'])

    A   A    A
0  a0  a4   a8
1  a1  a5   a9
2  a2  a6  a10
3  a3  a7  a11

In [18]:

col_concat['new_col_list'] = ['n1', 'n2', 'n3', 'n4']
print(col_concat)

    A   B   C   D   A   B   C   D    A    B    C    D new_col_list
0  a0  b0  c0  d0  a4  b4  c4  d4   a8   b8   c8   d8           n1
1  a1  b1  c1  d1  a5  b5  c5  d5   a9   b9   c9   d9           n2
2  a2  b2  c2  d2  a6  b6  c6  d6  a10  b10  c10  d10           n3
3  a3  b3  c3  d3  a7  b7  c7  d7  a11  b11  c11  d11           n4

In [19]:

col_concat['new_col_series'] = pd.Series(['n1', 'n2', 'n3', 'n4'])
print(col_concat)

    A   B   C   D   A   B   C   D    A    B    C    D new_col_list  \
0  a0  b0  c0  d0  a4  b4  c4  d4   a8   b8   c8   d8           n1   
1  a1  b1  c1  d1  a5  b5  c5  d5   a9   b9   c9   d9           n2   
2  a2  b2  c2  d2  a6  b6  c6  d6  a10  b10  c10  d10           n3   
3  a3  b3  c3  d3  a7  b7  c7  d7  a11  b11  c11  d11           n4   

  new_col_series  
0             n1  
1             n2  
2             n3  
3             n4

In [20]:

df1.columns = ['A', 'B', 'C', 'D']
df2.columns = ['E', 'F', 'G', 'H']
df3.columns = ['A', 'C', 'F', 'H']
print(df1)

    A   B   C   D
0  a0  b0  c0  d0
1  a1  b1  c1  d1
2  a2  b2  c2  d2
3  a3  b3  c3  d3

In [21]:

print(df2)

    E   F   G   H
0  a4  b4  c4  d4
1  a5  b5  c5  d5
2  a6  b6  c6  d6
3  a7  b7  c7  d7

In [22]:

print(df3)

     A    C    F    H
0   a8   b8   c8   d8
1   a9   b9   c9   d9
2  a10  b10  c10  d10
3  a11  b11  c11  d11

In [23]:

row_concat = pd.concat([df1, df2, df3])
print(row_concat)

     A    B    C    D    E    F    G    H
0   a0   b0   c0   d0  NaN  NaN  NaN  NaN
1   a1   b1   c1   d1  NaN  NaN  NaN  NaN
2   a2   b2   c2   d2  NaN  NaN  NaN  NaN
3   a3   b3   c3   d3  NaN  NaN  NaN  NaN
0  NaN  NaN  NaN  NaN   a4   b4   c4   d4
1  NaN  NaN  NaN  NaN   a5   b5   c5   d5
2  NaN  NaN  NaN  NaN   a6   b6   c6   d6
3  NaN  NaN  NaN  NaN   a7   b7   c7   d7
0   a8  NaN   b8  NaN  NaN   c8  NaN   d8
1   a9  NaN   b9  NaN  NaN   c9  NaN   d9
2  a10  NaN  b10  NaN  NaN  c10  NaN  d10
3  a11  NaN  b11  NaN  NaN  c11  NaN  d11

In [24]:

print(pd.concat([df1, df2, df3], join='inner'))

Empty DataFrame
Columns: []
Index: [0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3]

In [27]:

print(pd.concat([df1, df3], ignore_index=False, join='inner'))

     A    C
0   a0   c0
1   a1   c1
2   a2   c2
3   a3   c3
0   a8   b8
1   a9   b9
2  a10  b10
3  a11  b11

In [28]:

df1.index = [0, 1, 2, 3]
df2.index = [4, 5, 6, 7]
df3.index = [0, 2, 5, 7]
print(df1)

    A   B   C   D
0  a0  b0  c0  d0
1  a1  b1  c1  d1
2  a2  b2  c2  d2
3  a3  b3  c3  d3

In [29]:

print(df2)

    E   F   G   H
4  a4  b4  c4  d4
5  a5  b5  c5  d5
6  a6  b6  c6  d6
7  a7  b7  c7  d7

In [30]:

print(df3)

     A    C    F    H
0   a8   b8   c8   d8
2   a9   b9   c9   d9
5  a10  b10  c10  d10
7  a11  b11  c11  d11

In [31]:

col_concat = pd.concat([df1, df2, df3], axis="columns")
print(col_concat)

     A    B    C    D    E    F    G    H    A    C    F    H
0   a0   b0   c0   d0  NaN  NaN  NaN  NaN   a8   b8   c8   d8
1   a1   b1   c1   d1  NaN  NaN  NaN  NaN  NaN  NaN  NaN  NaN
2   a2   b2   c2   d2  NaN  NaN  NaN  NaN   a9   b9   c9   d9
3   a3   b3   c3   d3  NaN  NaN  NaN  NaN  NaN  NaN  NaN  NaN
4  NaN  NaN  NaN  NaN   a4   b4   c4   d4  NaN  NaN  NaN  NaN
5  NaN  NaN  NaN  NaN   a5   b5   c5   d5  a10  b10  c10  d10
6  NaN  NaN  NaN  NaN   a6   b6   c6   d6  NaN  NaN  NaN  NaN
7  NaN  NaN  NaN  NaN   a7   b7   c7   d7  a11  b11  c11  d11

In [32]:

print(pd.concat([df1, df3], axis="columns", join="inner"))

    A   B   C   D   A   C   F   H
0  a0  b0  c0  d0  a8  b8  c8  d8
2  a2  b2  c2  d2  a9  b9  c9  d9

In [38]:

from pathlib import Path

billboard_data_files = (
    Path(".")
    .glob("../data/billboard_by_week/billboard-*.xlsx")
)

billboard_data_files = sorted(list(billboard_data_files))
print(billboard_data_files)

[WindowsPath('../data/billboard_by_week/billboard-01.xlsx'), WindowsPath('../data/billboard_by_week/billboard-02.xlsx'), WindowsPath('../data/billboard_by_week/billboard-03.xlsx'), WindowsPath('../data/billboard_by_week/billboard-04.xlsx'), WindowsPath('../data/billboard_by_week/billboard-05.xlsx'), WindowsPath('../data/billboard_by_week/billboard-06.xlsx'), WindowsPath('../data/billboard_by_week/billboard-07.xlsx'), WindowsPath('../data/billboard_by_week/billboard-08.xlsx'), WindowsPath('../data/billboard_by_week/billboard-09.xlsx'), WindowsPath('../data/billboard_by_week/billboard-10.xlsx'), WindowsPath('../data/billboard_by_week/billboard-11.xlsx'), WindowsPath('../data/billboard_by_week/billboard-12.xlsx'), WindowsPath('../data/billboard_by_week/billboard-13.xlsx'), WindowsPath('../data/billboard_by_week/billboard-14.xlsx'), WindowsPath('../data/billboard_by_week/billboard-15.xlsx'), WindowsPath('../data/billboard_by_week/billboard-16.xlsx'), WindowsPath('../data/billboard_by_week/billboard-17.xlsx'), WindowsPath('../data/billboard_by_week/billboard-18.xlsx'), WindowsPath('../data/billboard_by_week/billboard-19.xlsx'), WindowsPath('../data/billboard_by_week/billboard-20.xlsx'), WindowsPath('../data/billboard_by_week/billboard-21.xlsx'), WindowsPath('../data/billboard_by_week/billboard-22.xlsx'), WindowsPath('../data/billboard_by_week/billboard-23.xlsx'), WindowsPath('../data/billboard_by_week/billboard-24.xlsx'), WindowsPath('../data/billboard_by_week/billboard-25.xlsx'), WindowsPath('../data/billboard_by_week/billboard-26.xlsx'), WindowsPath('../data/billboard_by_week/billboard-27.xlsx'), WindowsPath('../data/billboard_by_week/billboard-28.xlsx'), WindowsPath('../data/billboard_by_week/billboard-29.xlsx'), WindowsPath('../data/billboard_by_week/billboard-30.xlsx'), WindowsPath('../data/billboard_by_week/billboard-31.xlsx'), WindowsPath('../data/billboard_by_week/billboard-32.xlsx'), WindowsPath('../data/billboard_by_week/billboard-33.xlsx'), WindowsPath('../data/billboard_by_week/billboard-34.xlsx'), WindowsPath('../data/billboard_by_week/billboard-35.xlsx'), WindowsPath('../data/billboard_by_week/billboard-36.xlsx'), WindowsPath('../data/billboard_by_week/billboard-37.xlsx'), WindowsPath('../data/billboard_by_week/billboard-38.xlsx'), WindowsPath('../data/billboard_by_week/billboard-39.xlsx'), WindowsPath('../data/billboard_by_week/billboard-40.xlsx'), WindowsPath('../data/billboard_by_week/billboard-41.xlsx'), WindowsPath('../data/billboard_by_week/billboard-42.xlsx'), WindowsPath('../data/billboard_by_week/billboard-43.xlsx'), WindowsPath('../data/billboard_by_week/billboard-44.xlsx'), WindowsPath('../data/billboard_by_week/billboard-45.xlsx'), WindowsPath('../data/billboard_by_week/billboard-46.xlsx'), WindowsPath('../data/billboard_by_week/billboard-47.xlsx'), WindowsPath('../data/billboard_by_week/billboard-48.xlsx'), WindowsPath('../data/billboard_by_week/billboard-49.xlsx'), WindowsPath('../data/billboard_by_week/billboard-50.xlsx'), WindowsPath('../data/billboard_by_week/billboard-51.xlsx'), WindowsPath('../data/billboard_by_week/billboard-52.xlsx'), WindowsPath('../data/billboard_by_week/billboard-53.xlsx'), WindowsPath('../data/billboard_by_week/billboard-54.xlsx'), WindowsPath('../data/billboard_by_week/billboard-55.xlsx'), WindowsPath('../data/billboard_by_week/billboard-56.xlsx'), WindowsPath('../data/billboard_by_week/billboard-57.xlsx'), WindowsPath('../data/billboard_by_week/billboard-58.xlsx'), WindowsPath('../data/billboard_by_week/billboard-59.xlsx'), WindowsPath('../data/billboard_by_week/billboard-60.xlsx'), WindowsPath('../data/billboard_by_week/billboard-61.xlsx'), WindowsPath('../data/billboard_by_week/billboard-62.xlsx'), WindowsPath('../data/billboard_by_week/billboard-63.xlsx'), WindowsPath('../data/billboard_by_week/billboard-64.xlsx'), WindowsPath('../data/billboard_by_week/billboard-65.xlsx'), WindowsPath('../data/billboard_by_week/billboard-66.xlsx'), WindowsPath('../data/billboard_by_week/billboard-67.xlsx'), WindowsPath('../data/billboard_by_week/billboard-68.xlsx'), WindowsPath('../data/billboard_by_week/billboard-69.xlsx'), WindowsPath('../data/billboard_by_week/billboard-70.xlsx'), WindowsPath('../data/billboard_by_week/billboard-71.xlsx'), WindowsPath('../data/billboard_by_week/billboard-72.xlsx'), WindowsPath('../data/billboard_by_week/billboard-73.xlsx'), WindowsPath('../data/billboard_by_week/billboard-74.xlsx'), WindowsPath('../data/billboard_by_week/billboard-75.xlsx'), WindowsPath('../data/billboard_by_week/billboard-76.xlsx')]

In [39]:

billboard_data_files = list(billboard_data_files)

In [41]:

billboard01 = pd.read_excel(billboard_data_files[0])
billboard02 = pd.read_excel(billboard_data_files[1])
billboard03 = pd.read_excel(billboard_data_files[2])

print(billboard01.head())

   year        artist                    track      time date.entered week  \
0  2000         2 Pac  Baby Don't Cry (Keep...  04:22:00   2000-02-26  wk1   
1  2000       2Ge+her  The Hardest Part Of ...  03:15:00   2000-09-02  wk1   
2  2000  3 Doors Down               Kryptonite  03:53:00   2000-04-08  wk1   
3  2000  3 Doors Down                    Loser  04:24:00   2000-10-21  wk1   
4  2000      504 Boyz            Wobble Wobble  03:35:00   2000-04-15  wk1   

   rating  
0      87  
1      91  
2      81  
3      76  
4      57

In [42]:

print(billboard01.shape)

(317, 7)

In [43]:

billboard = pd.concat([billboard01, billboard02, billboard03])
print(billboard.shape)

(951, 7)

In [44]:

from pathlib import Path

billboard_data_files = (
    Path(".")
    .glob("../data/billboard_by_week/billboard-*.xlsx")
)

list_billboard_df = []

for excel_filename in billboard_data_files:
    df = pd.read_excel(excel_filename)
    
    list_billboard_df.append(df)

print(len(list_billboard_df))

In [45]:

print(type(list_billboard_df[0]))

<class 'pandas.core.frame.DataFrame'>

In [46]:

print(list_billboard_df[0])

     year            artist                    track      time date.entered  \
0    2000             2 Pac  Baby Don't Cry (Keep...  04:22:00   2000-02-26   
1    2000           2Ge+her  The Hardest Part Of ...  03:15:00   2000-09-02   
2    2000      3 Doors Down               Kryptonite  03:53:00   2000-04-08   
3    2000      3 Doors Down                    Loser  04:24:00   2000-10-21   
4    2000          504 Boyz            Wobble Wobble  03:35:00   2000-04-15   
..    ...               ...                      ...       ...          ...   
312  2000       Yankee Grey     Another Nine Minutes  03:10:00   2000-04-29   
313  2000  Yearwood, Trisha          Real Live Woman  03:55:00   2000-04-01   
314  2000   Ying Yang Twins  Whistle While You Tw...  04:19:00   2000-03-18   
315  2000     Zombie Nation            Kernkraft 400  03:30:00   2000-09-02   
316  2000   matchbox twenty                     Bent  04:12:00   2000-04-29   

    week  rating  
0    wk1      87  
1    wk1      91  
2    wk1      81  
3    wk1      76  
4    wk1      57  
..   ...     ...  
312  wk1      86  
313  wk1      85  
314  wk1      95  
315  wk1      99  
316  wk1      60  

[317 rows x 7 columns]

In [47]:

billboard_loop_concat = pd.concat(list_billboard_df)
print(billboard_loop_concat.shape)

(24092, 7)

In [48]:

billboard_Data_files = (
    Path('.')
    .glob("../data/billboard_by_week/billboard-*.xlsx")
)

billboard_dfs = [pd.read_excel(data) for data in billboard_data_files]

In [49]:

print(type(billboard_dfs))

<class 'list'>

In [50]:

billboard_concat_comp = pd.concat(billboard_dfs)
print(billboard_concat_comp)

---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
Cell In[50], line 1
----> 1 billboard_concat_comp = pd.concat(billboard_dfs)
      2 print(billboard_concat_comp)

File ~\anaconda3\Lib\site-packages\pandas\core\reshape\concat.py:372, in concat(objs, axis, join, ignore_index, keys, levels, names, verify_integrity, sort, copy)
    369 elif copy and using_copy_on_write():
    370     copy = False
--> 372 op = _Concatenator(
    373     objs,
    374     axis=axis,
    375     ignore_index=ignore_index,
    376     join=join,
    377     keys=keys,
    378     levels=levels,
    379     names=names,
    380     verify_integrity=verify_integrity,
    381     copy=copy,
    382     sort=sort,
    383 )
    385 return op.get_result()

File ~\anaconda3\Lib\site-packages\pandas\core\reshape\concat.py:429, in _Concatenator.__init__(self, objs, axis, join, keys, levels, names, ignore_index, verify_integrity, copy, sort)
    426     objs = list(objs)
    428 if len(objs) == 0:
--> 429     raise ValueError("No objects to concatenate")
    431 if keys is None:
    432     objs = list(com.not_none(*objs))

ValueError: No objects to concatenate

In [51]:

person = pd.read_csv("../data/survey_person.csv")
site = pd.read_csv("../data/survey_site.csv")
survey = pd.read_csv("../data/survey_survey.csv")
visited = pd.read_csv("../data/survey_visited.csv")

print(person)

      ident   personal    family
0      dyer    William      Dyer
1        pb      Frank   Pabodie
2      lake   Anderson      Lake
3       roe  Valentina   Roerich
4  danforth      Frank  Danforth

In [52]:

print(site)

    name    lat    long
0   DR-1 -49.85 -128.57
1   DR-3 -47.15 -126.72
2  MSK-4 -48.87 -123.40

In [53]:

print(visited)

   ident   site       dated
0    619   DR-1  1927-02-08
1    622   DR-1  1927-02-10
2    734   DR-3  1939-01-07
3    735   DR-3  1930-01-12
4    751   DR-3  1930-02-26
5    752   DR-3         NaN
6    837  MSK-4  1932-01-14
7    844   DR-1  1932-03-22

In [54]:

print(survey)

    taken person quant  reading
0     619   dyer   rad     9.82
1     619   dyer   sal     0.13
2     622   dyer   rad     7.80
3     622   dyer   sal     0.09
4     734     pb   rad     8.41
5     734   lake   sal     0.05
6     734     pb  temp   -21.50
7     735     pb   rad     7.22
8     735    NaN   sal     0.06
9     735    NaN  temp   -26.00
10    751     pb   rad     4.35
11    751     pb  temp   -18.50
12    751   lake   sal     0.10
13    752   lake   rad     2.19
14    752   lake   sal     0.09
15    752   lake  temp   -16.00
16    752    roe   sal    41.60
17    837   lake   rad     1.46
18    837   lake   sal     0.21
19    837    roe   sal    22.50
20    844    roe   rad    11.25

In [57]:

visited_subset = visited.loc[[0, 2, 6], :]
print(visited_subset)

   ident   site       dated
0    619   DR-1  1927-02-08
2    734   DR-3  1939-01-07
6    837  MSK-4  1932-01-14

In [59]:

print(visited_subset["site"].value_counts())

site
DR-1     1
DR-3     1
MSK-4    1
Name: count, dtype: int64

In [60]:

o2o_merge = site.merge(visited_subset, left_on="name", right_on="site")
print(o2o_merge)

    name    lat    long  ident   site       dated
0   DR-1 -49.85 -128.57    619   DR-1  1927-02-08
1   DR-3 -47.15 -126.72    734   DR-3  1939-01-07
2  MSK-4 -48.87 -123.40    837  MSK-4  1932-01-14

In [61]:

print(visited["site"].value_counts())

site
DR-3     4
DR-1     3
MSK-4    1
Name: count, dtype: int64

In [62]:

m2o_merge = site.merge(visited, left_on="name", right_on="site")
print(m2o_merge)

    name    lat    long  ident   site       dated
0   DR-1 -49.85 -128.57    619   DR-1  1927-02-08
1   DR-1 -49.85 -128.57    622   DR-1  1927-02-10
2   DR-1 -49.85 -128.57    844   DR-1  1932-03-22
3   DR-3 -47.15 -126.72    734   DR-3  1939-01-07
4   DR-3 -47.15 -126.72    735   DR-3  1930-01-12
5   DR-3 -47.15 -126.72    751   DR-3  1930-02-26
6   DR-3 -47.15 -126.72    752   DR-3         NaN
7  MSK-4 -48.87 -123.40    837  MSK-4  1932-01-14

In [63]:

ps = person.merge(survey, left_on="ident", right_on="person")
vs = visited.merge(survey, left_on="ident", right_on="taken")
print(ps)

   ident   personal   family  taken person quant  reading
0   dyer    William     Dyer    619   dyer   rad     9.82
1   dyer    William     Dyer    619   dyer   sal     0.13
2   dyer    William     Dyer    622   dyer   rad     7.80
3   dyer    William     Dyer    622   dyer   sal     0.09
4     pb      Frank  Pabodie    734     pb   rad     8.41
5     pb      Frank  Pabodie    734     pb  temp   -21.50
6     pb      Frank  Pabodie    735     pb   rad     7.22
7     pb      Frank  Pabodie    751     pb   rad     4.35
8     pb      Frank  Pabodie    751     pb  temp   -18.50
9   lake   Anderson     Lake    734   lake   sal     0.05
10  lake   Anderson     Lake    751   lake   sal     0.10
11  lake   Anderson     Lake    752   lake   rad     2.19
12  lake   Anderson     Lake    752   lake   sal     0.09
13  lake   Anderson     Lake    752   lake  temp   -16.00
14  lake   Anderson     Lake    837   lake   rad     1.46
15  lake   Anderson     Lake    837   lake   sal     0.21
16   roe  Valentina  Roerich    752    roe   sal    41.60
17   roe  Valentina  Roerich    837    roe   sal    22.50
18   roe  Valentina  Roerich    844    roe   rad    11.25

In [64]:

print(vs)

    ident   site       dated  taken person quant  reading
0     619   DR-1  1927-02-08    619   dyer   rad     9.82
1     619   DR-1  1927-02-08    619   dyer   sal     0.13
2     622   DR-1  1927-02-10    622   dyer   rad     7.80
3     622   DR-1  1927-02-10    622   dyer   sal     0.09
4     734   DR-3  1939-01-07    734     pb   rad     8.41
5     734   DR-3  1939-01-07    734   lake   sal     0.05
6     734   DR-3  1939-01-07    734     pb  temp   -21.50
7     735   DR-3  1930-01-12    735     pb   rad     7.22
8     735   DR-3  1930-01-12    735    NaN   sal     0.06
9     735   DR-3  1930-01-12    735    NaN  temp   -26.00
10    751   DR-3  1930-02-26    751     pb   rad     4.35
11    751   DR-3  1930-02-26    751     pb  temp   -18.50
12    751   DR-3  1930-02-26    751   lake   sal     0.10
13    752   DR-3         NaN    752   lake   rad     2.19
14    752   DR-3         NaN    752   lake   sal     0.09
15    752   DR-3         NaN    752   lake  temp   -16.00
16    752   DR-3         NaN    752    roe   sal    41.60
17    837  MSK-4  1932-01-14    837   lake   rad     1.46
18    837  MSK-4  1932-01-14    837   lake   sal     0.21
19    837  MSK-4  1932-01-14    837    roe   sal    22.50
20    844   DR-1  1932-03-22    844    roe   rad    11.25

In [65]:

print(ps["quant"].value_counts())

quant
rad     8
sal     8
temp    3
Name: count, dtype: int64

In [66]:

print(vs["quant"].value_counts())

quant
sal     9
rad     8
temp    4
Name: count, dtype: int64

In [67]:

ps_vs = ps.merge(
    vs,
    left_on=["quant"],
    right_on=["quant"]
)

In [68]:

print(ps_vs.loc[0, :])

ident_x            dyer
personal        William
family             Dyer
taken_x             619
person_x           dyer
quant               rad
reading_x          9.82
ident_y             619
site               DR-1
dated        1927-02-08
taken_y             619
person_y           dyer
reading_y          9.82
Name: 0, dtype: object

In [69]:

print(ps.shape)

(19, 7)

In [71]:

print(vs.shape)

(21, 7)

In [72]:

print(ps_vs.shape)

(148, 13)

In [73]:

assert vs.shape[0] == 21

In [74]:

assert ps_vs.shape[0] <= vs.shape[0]

---------------------------------------------------------------------------
AssertionError                            Traceback (most recent call last)
Cell In[74], line 1
----> 1 assert ps_vs.shape[0] <= vs.shape[0]

AssertionError:

In [77]:

billboard = pd.read_csv("../data/billboard.csv")

billboard_long = billboard.melt(
    id_vars = ["year", "artist", "track", "time", "date.entered"],
    var_name="week",
    value_name="rating",
)

print(billboard_long.head())

   year        artist                    track  time date.entered week  rating
0  2000         2 Pac  Baby Don't Cry (Keep...  4:22   2000-02-26  wk1    87.0
1  2000       2Ge+her  The Hardest Part Of ...  3:15   2000-09-02  wk1    91.0
2  2000  3 Doors Down               Kryptonite  3:53   2000-04-08  wk1    81.0
3  2000  3 Doors Down                    Loser  4:24   2000-10-21  wk1    76.0
4  2000      504 Boyz            Wobble Wobble  3:35   2000-04-15  wk1    57.0

In [78]:

print(billboard_long.loc[billboard_long.track == 'Loser'])

       year        artist  track  time date.entered  week  rating
3      2000  3 Doors Down  Loser  4:24   2000-10-21   wk1    76.0
320    2000  3 Doors Down  Loser  4:24   2000-10-21   wk2    76.0
637    2000  3 Doors Down  Loser  4:24   2000-10-21   wk3    72.0
954    2000  3 Doors Down  Loser  4:24   2000-10-21   wk4    69.0
1271   2000  3 Doors Down  Loser  4:24   2000-10-21   wk5    67.0
...     ...           ...    ...   ...          ...   ...     ...
22510  2000  3 Doors Down  Loser  4:24   2000-10-21  wk72     NaN
22827  2000  3 Doors Down  Loser  4:24   2000-10-21  wk73     NaN
23144  2000  3 Doors Down  Loser  4:24   2000-10-21  wk74     NaN
23461  2000  3 Doors Down  Loser  4:24   2000-10-21  wk75     NaN
23778  2000  3 Doors Down  Loser  4:24   2000-10-21  wk76     NaN

[76 rows x 7 columns]

In [79]:

billboard_songs = billboard_long[
    ["year", "artist", "track", "time", "date.entered"]
]

print(billboard_songs.shape)

(24092, 5)

In [80]:

billboard_songs = billboard_songs.drop_duplicates()
print(billboard_songs.shape)

(317, 5)

In [81]:

billboard_songs['id'] = billboard_songs.index + 1
print(billboard_songs)

     year            artist                    track  time date.entered   id
0    2000             2 Pac  Baby Don't Cry (Keep...  4:22   2000-02-26    1
1    2000           2Ge+her  The Hardest Part Of ...  3:15   2000-09-02    2
2    2000      3 Doors Down               Kryptonite  3:53   2000-04-08    3
3    2000      3 Doors Down                    Loser  4:24   2000-10-21    4
4    2000          504 Boyz            Wobble Wobble  3:35   2000-04-15    5
..    ...               ...                      ...   ...          ...  ...
312  2000       Yankee Grey     Another Nine Minutes  3:10   2000-04-29  313
313  2000  Yearwood, Trisha          Real Live Woman  3:55   2000-04-01  314
314  2000   Ying Yang Twins  Whistle While You Tw...  4:19   2000-03-18  315
315  2000     Zombie Nation            Kernkraft 400  3:30   2000-09-02  316
316  2000   matchbox twenty                     Bent  4:12   2000-04-29  317

[317 rows x 6 columns]

In [82]:

billboard_ratings = billboard_long.merge(
    billboard_songs, on=["year", "artist", "track", "time", "date.entered"]
)
print(billboard_ratings.shape)

(24092, 8)

In [84]:

print(billboard_ratings.head())

   year artist                    track  time date.entered week  rating  id
0  2000  2 Pac  Baby Don't Cry (Keep...  4:22   2000-02-26  wk1    87.0   1
1  2000  2 Pac  Baby Don't Cry (Keep...  4:22   2000-02-26  wk2    82.0   1
2  2000  2 Pac  Baby Don't Cry (Keep...  4:22   2000-02-26  wk3    72.0   1
3  2000  2 Pac  Baby Don't Cry (Keep...  4:22   2000-02-26  wk4    77.0   1
4  2000  2 Pac  Baby Don't Cry (Keep...  4:22   2000-02-26  wk5    87.0   1

In [85]:

billboard_ratings = billboard_ratings[["id", "week", "rating"]]

In [86]:

print(billboard_ratings.head())

   id week  rating
0   1  wk1    87.0
1   1  wk2    82.0
2   1  wk3    72.0
3   1  wk4    77.0
4   1  wk5    87.0

In [ ]:

[08] Do it! 데이터 분석을 위한 판다스 입문 (0)	2024.01.08
[07] Do it! 데이터 분석을 위한 판다스 입문 (0)	2024.01.08
[05] Do it! 데이터 분석을 위한 판다스 입문 (0)	2024.01.06
[04] Do it! 데이터 분석을 위한 판다스 입문 (0)	2024.01.05
[03] Do it! 데이터 분석을 위한 판다스 입문 (0)	2024.01.04

내 블로그 - 관리자 홈 전환	`Q` `Q`
새 글 쓰기	`W` `W`

글 수정 (권한 있는 경우)	`E` `E`
댓글 영역으로 이동	`C` `C`

이 페이지의 URL 복사	`S` `S`
맨 위로 이동	`T` `T`
티스토리 홈 이동	`H` `H`
단축키 안내	`Shift` + `/` `⇧` + `/`

늦었다고 안 하면 더 늦어요

[06] Do it! 데이터 분석을 위한 판다스 입문

6일차

07 데이터 결합하고 분해하기¶

07-1 데이터 묶어 분석하기¶

'도서 > 프로그래밍' 카테고리의 다른 글

티스토리툴바

단축키

내 블로그

블로그 게시글

모든 영역

[06] Do it! 데이터 분석을 위한 판다스 입문

6일차

07 데이터 결합하고 분해하기¶

07-1 데이터 묶어 분석하기¶

'도서 > 프로그래밍' 카테고리의 다른 글

관련글

티스토리툴바

단축키

내 블로그

블로그 게시글

모든 영역