|
import pandas as pd |
|
import numpy as np |
|
|
|
def data_transform(df_all: pd.DataFrame, first_day_future) -> tuple:
    """Build one-step-lagged features, including a row for a future date.

    A row labelled ``first_day_future`` is appended to ``df_all`` and every
    column is shifted down by one row, so each date carries the values of the
    row before it and the appended date carries the last observed values.
    The first row, emptied by the shift, is filled back from the row below.

    Args:
        df_all: Frame whose index can be parsed by ``pandas.to_datetime``.
            Must contain ``CPI``, ``Employment``, ``prices`` and ``id``
            columns; ``id`` may appear once or several times (the first
            occurrence is used).  An ``index`` column, if present, is
            dropped.  The caller's frame is left untouched.
        first_day_future: Label of the appended row; anything
            ``pandas.to_datetime`` accepts.

    Returns:
        A ``(reframed_lags, df_final)`` tuple.

        ``df_final`` is indexed by a UTC ``DatetimeIndex`` named ``date``
        and holds the shifted/filled input columns (``id`` replaced by a
        constant ``name`` column) plus ``name_no``, ``Day``, ``Month`` and
        ``Year``.

        ``reframed_lags`` is ``df_final`` without ``name``, with one
        ``day_<n>`` indicator column per day-of-month present, a fresh
        integer index, and ``lag1`` / ``lag2`` columns: ``prices`` rotated
        by one and two rows respectively (the leading rows wrap around to
        the last prices).

    Raises:
        KeyError: If a required column is missing.
        IndexError: If ``df_all`` has no rows or ``id`` is entirely null.
    """
    # Work on a copy: the float casts below would otherwise be written into
    # the caller's frame.
    df_all = df_all.copy()
    df_all['CPI'] = df_all['CPI'].astype('float')
    df_all['Employment'] = df_all['Employment'].astype('float')
    if 'index' in df_all.columns:
        df_all = df_all.drop(columns='index')

    # Append a placeholder row for the future date.  Its values are pushed
    # out by the shift below, so only its label matters; duplicating the last
    # row (rather than transposing it) keeps every column's dtype intact.
    future_label = pd.to_datetime(first_day_future).strftime("%Y-%m-%d")
    observed_labels = pd.to_datetime(df_all.index).strftime("%Y-%m-%d")
    df_with_future = pd.concat([df_all, df_all.iloc[[-1]]], axis=0)
    df_with_future.index = pd.Index(
        list(observed_labels) + [future_label], name='date'
    )

    shifted = df_with_future.shift()

    # Interpolate numeric columns only: pandas refuses (or deprecates)
    # interpolation of object columns.  Columns are walked by position so
    # duplicated column names such as a repeated ``id`` are handled.
    filled_columns = []
    for position in range(shifted.shape[1]):
        column = shifted.iloc[:, position]
        if pd.api.types.is_numeric_dtype(column.dtype):
            column = column.interpolate(
                method='linear', limit_direction='both', limit=100
            )
        filled_columns.append(column)
    df_final = pd.concat(filled_columns, axis=1).bfill().ffill()

    # ``df_final['id']`` is a DataFrame when the label is duplicated.
    id_values = df_final['id']
    if isinstance(id_values, pd.DataFrame):
        id_values = id_values.iloc[:, 0]
    df_final['name'] = id_values.dropna().iloc[0]
    df_final = df_final.drop(columns='id')

    # ``name`` is constant, so its one-hot encoding has exactly one column.
    name_dummies = pd.get_dummies(df_final['name'], dtype='int')
    df_final['name_no'] = name_dummies.iloc[:, 0].to_numpy()

    df_final.index = pd.to_datetime(df_final.index, utc=True).rename('date')
    df_final['Day'] = df_final.index.day
    df_final['Month'] = df_final.index.month
    df_final['Year'] = df_final.index.year

    seasonal_dummy = pd.get_dummies(df_final.index.day, dtype='int')
    seasonal_dummy.index = df_final.index
    seasonal_dummy.columns = [f'day_{value}' for value in seasonal_dummy.columns]

    reframed = pd.concat([df_final, seasonal_dummy], axis=1).drop(columns='name')
    # Diagnostic output kept from the original implementation.
    print(reframed.iloc[-5:, :])

    reframed_lags = reframed.reset_index(drop=True)

    # Circular lags: row i gets prices[i-1] / prices[i-2], with the first
    # rows wrapping around to the last prices.
    prices = reframed_lags['prices'].to_numpy()
    reframed_lags['lag1'] = np.roll(prices, 1)
    reframed_lags['lag2'] = np.roll(prices, 2)

    return reframed_lags, df_final