# File size: 2,647 bytes (revision 96edc51)
import pandas as pd
import numpy as np
def _append_future_row(df_all, first_day_future):
    """Return ``df_all`` extended by one row labelled ``first_day_future``.

    The index of the result holds ``YYYY-MM-DD`` strings and is named ``date``.
    The appended row is a copy of the last observed row: its values are pushed
    out by the one-step ``shift`` applied by the caller, so only its presence
    and its dtypes matter. Copying the row (rather than rebuilding it through
    a transposed object frame) keeps numeric columns numeric after ``concat``.
    """
    future_label = pd.to_datetime(first_day_future).strftime("%Y-%m-%d")
    observed_labels = pd.to_datetime(df_all.index).strftime("%Y-%m-%d")
    extended = pd.concat([df_all, df_all.iloc[[-1]]], axis=0)
    extended.index = observed_labels.append(pd.Index([future_label])).rename('date')
    return extended


def _fill_gaps(frame):
    """Fill missing values column by column.

    Numeric columns are linearly interpolated in both directions (at most 100
    consecutive gaps); every column is then back-filled and forward-filled.
    Non-numeric columns skip the interpolation step because pandas does not
    interpolate object data.
    """
    def _fill_column(column):
        if pd.api.types.is_numeric_dtype(column.dtype):
            column = column.interpolate(method='linear', limit_direction='both',
                                        limit=100)
        return column.bfill().ffill()

    return frame.apply(_fill_column)


def _add_price_lags(frame):
    """Return a copy of ``frame`` with ``lag1`` and ``lag2`` price columns.

    ``lag1`` is ``prices`` shifted by one row and ``lag2`` is ``lag1`` shifted
    by one row. The first row has no predecessor, so it is seeded with the last
    price (``lag1``) and the second-to-last price (``lag2``).
    """
    prices = frame['prices']
    lag1 = prices.shift(1)
    lag1.iloc[0] = prices.iloc[-1]
    lag2 = lag1.shift(1)
    lag2.iloc[0] = prices.iloc[-2]

    with_lags = frame.copy()
    with_lags['lag1'] = lag1
    with_lags['lag2'] = lag2
    return with_lags


def data_transform(df_all, first_day_future):
    """Build the lagged feature frame used for a one-step-ahead forecast.

    Steps:
      1. Cast ``CPI`` and ``Employment`` to float.
      2. Append a row dated ``first_day_future`` and shift every column down
         by one row, so each date carries the previous row's values.
      3. Fill the gaps created by the shift (and any other missing values).
      4. Replace ``id`` by a constant ``name`` column and its dummy ``name_no``.
      5. Add ``Day``/``Month``/``Year`` and one ``day_<n>`` dummy per day of
         month present in the index.
      6. Add ``lag1``/``lag2`` columns derived from ``prices``.

    Parameters
    ----------
    df_all : pandas.DataFrame
        Must contain ``CPI``, ``Employment``, ``id`` and ``prices`` columns and
        an index that ``pandas.to_datetime`` can parse. ``id`` may appear once
        or several times; the first occurrence is used. A column literally
        named ``index`` is dropped.
        NOTE: ``CPI`` and ``Employment`` are converted to float in place on
        the caller's frame.
    first_day_future : str or datetime-like
        Date of the row to forecast; anything ``pandas.to_datetime`` accepts.

    Returns
    -------
    (reframed_lags, df_final) : tuple of pandas.DataFrame
        ``reframed_lags`` has a fresh integer index, no ``name`` column, and
        the ``day_<n>``, ``lag1`` and ``lag2`` columns added.
        ``df_final`` is indexed by a UTC ``DatetimeIndex`` named ``date`` and
        contains ``name``, ``name_no``, ``Day``, ``Month`` and ``Year`` but
        neither the day dummies nor the lags.

    Raises
    ------
    KeyError
        If one of the required columns is missing.
    IndexError
        If ``df_all`` has no rows, or ``id`` contains no non-null value.
    """
    df_all['CPI'] = df_all['CPI'].astype('float')
    df_all['Employment'] = df_all['Employment'].astype('float')
    if 'index' in df_all.columns:
        df_all = df_all.drop(columns='index')

    # Add future row and shift X columns.
    df_with_future = _append_future_row(df_all, first_day_future)
    df_final = _fill_gaps(df_with_future.shift())

    # `id` may be duplicated (selection yields a DataFrame) or unique (Series).
    id_values = df_final['id']
    if isinstance(id_values, pd.DataFrame):
        id_values = id_values.iloc[:, 0]
    # .iloc[0] rather than [0]: the index holds date labels, not positions.
    df_final['name'] = id_values.dropna().iloc[0]
    df_final = df_final.drop(columns='id')

    # Data transformation: name dummy, time variables, day-of-month dummies.
    # `name` is constant, so get_dummies yields a single all-ones column.
    df_final['name_no'] = pd.get_dummies(df_final['name'], dtype='int').iloc[:, 0]
    df_final.index = pd.to_datetime(df_final.index, utc=True).rename('date')
    df_final['Day'] = df_final.index.day
    df_final['Month'] = df_final.index.month
    df_final['Year'] = df_final.index.year

    seasonal_dummy = pd.get_dummies(df_final.index.day, dtype='int')
    seasonal_dummy.index = df_final.index
    seasonal_dummy.columns = ['day_' + str(value) for value in seasonal_dummy.columns]

    reframed = pd.concat([df_final, seasonal_dummy], axis=1).drop(columns='name')
    # Diagnostic output kept from the original implementation.
    print(reframed.iloc[-5:, :])

    reframed_lags = _add_price_lags(reframed.reset_index(drop=True))
    return reframed_lags, df_final