In [90]:
import pandas as pd
import csv
from numpy import *
import sklearn
import numpy as np
from sklearn import linear_model
from scipy import stats
import matplotlib.pyplot as plt
%matplotlib inline
import time
from sklearn import cross_validation
from sklearn.ensemble import RandomForestClassifier
import copy
In [91]:
raw_data = pd.read_csv('processed_Df.csv', parse_dates= ['date'], index_col='date')
y_temp = raw_data['BGEDAe'] - raw_data['BGERTe'] - abs(raw_data['OPRES'])
pd.options.mode.chained_assignment = None  # default='warn'

data_lag  = 48
bucketSize = 24 #how many hours to group
#now divide the data into blocks of d days for regression and 1 day for testing
reg_days = 15 #30 worked well
reg_time = 24/bucketSize * reg_days # how many points to regress

lfInds = [0,1,2,4,6] #which load forecast indicies to use as features
#get the Data
X_temp = raw_data.iloc[:,lfInds]
X_temp.iloc[:,range(len(lfInds ))] =X_temp.iloc[:,range(len(lfInds ))]/1000 #scale load forecast data
X_temp['shift'] = y_temp.shift( data_lag ).fillna(0)  # at decision time you have lagged y values
X_temp['WH_recent'] = raw_data['WESTERN HUBDAe'].shift(24).fillna(0) #at decsion time DA has clear for current day
#combine data into buckets of bucketSize
y_temp = pd.stats.moments.rolling_mean(y_temp, bucketSize )
X_temp = pd.stats.moments.rolling_mean(X_temp, bucketSize )
X = X_temp.iloc[bucketSize::bucketSize,:]
y = y_temp.iloc[bucketSize::bucketSize]
In [105]:
clf = RandomForestClassifier(n_estimators=8, max_depth =3,min_samples_split =5, verbose=0, random_state= 55)#verbose =3 for info
curr_date = data_lag/bucketSize # adjust for the lag
y_pred= []
y_act = []
#x_val = []
#date adjusts for regression and test intervals
st_dt_adj  = 0
end_dt_adj = 0
#build and test model for each interval
while (curr_date + reg_time + 24/bucketSize + st_dt_adj ) < len(X - end_dt_adj):
    X_train = X.iloc[ curr_date:curr_date+reg_time, : ]
    X_test  = X.iloc[ curr_date+reg_time:curr_date+reg_time+24/bucketSize, : ]
    y_train = y[ curr_date:curr_date+reg_time ]  
    y_test  = y[ curr_date+reg_time:curr_date+reg_time+24/bucketSize ]
    clf.fit(X_train, y_train )
    y_act.append( y_test[0] )
    y_pred.append(clf.predict(X_test)[0])
   # x_val.append( X_test )
    curr_date = curr_date + 24/bucketSize

y_pred = array(y_pred)
y_act  = array(y_act)
y_adj_val = copy.deepcopy(y_act)
y_adj_val[ abs(y_pred) > 1 ] = 0 #take stragtegy that volitile days are bad
y_adj_val2 = copy.deepcopy(y_act)                
y_adj_val2[ y_pred < 0 ] = 0 # Choose RT price when model tells you to.                                              
In [106]:
plt.plot( pd.stats.moments.rolling_mean( y_act, 30 ) ,  label="Actual" )
plt.plot( pd.stats.moments.rolling_mean( y_adj_val, 30 ), label="Strategy 1" )
plt.plot( pd.stats.moments.rolling_mean( y_adj_val2, 30 ), label="Strategy 2" )
plt.title( "Historical Rolling 30-day Payoff ($/MWh)" )
plt.ylabel('$/MWh')
plt.legend(loc=2)
Out[106]:
<matplotlib.legend.Legend at 0x11346f650>
In [1]:
#%cd Desktop/models/
/Users/amihalik/Desktop/models

In [107]:
plt.hist([y_act[30::30], y_adj_val[30::30],y_adj_val2[30::30] ] , bins = 30, label=['Actual: %.1f' %std(y_act[30::30]), 'S1: %.1f' %std(y_adj_val[30::30]), 'S2: %.1f' %std(y_adj_val2[30::30])]);
plt.legend(loc=2)
plt.title( "BGE 30-day PnL Histogram ($/MWh)" )
plt.xlabel('$/MWh PnL')
plt.ylabel('Frequency')
Out[107]:
<matplotlib.text.Text at 0x113481350>
In [108]:
print mean(y_act)
print mean(y_adj_val )
print mean(y_adj_val2 )
-0.0611442371379
0.070713647678
0.387131911454

In [109]:
print ((sum((y_act[y_pred>0] -y_pred[y_pred>0]*0+.15)**2)+sum(y_act[y_pred<=0]**2)))/sum(y_act**2)
print ((sum((y_act[abs(y_pred)<1] -y_pred[abs(y_pred)<1]*0+.15)**2)+sum(y_act[abs(y_pred)>=1]**2)))/sum(y_act**2)
1.00030505029
1.00005720487

In [96]:
 
In [110]:
#SHOW RESULTS FOR ANOTHER LOAD ZONE
raw_data = pd.read_csv('processed_Df.csv', parse_dates= ['date'], index_col='date')
y_temp = raw_data['APSDAe'] - raw_data['APSRTe'] - abs(raw_data['OPRES'])
pd.options.mode.chained_assignment = None  # default='warn'

data_lag  = 48
bucketSize = 24 #how many hours to group
#now divide the data into blocks of d days for regression and 1 day for testing
reg_days = 15 #30 worked well
reg_time = 24/bucketSize * reg_days # how many points to regress

lfInds = [0,1,2,4,6] #which load forecast indicies to use as features
#get the Data
X_temp = raw_data.iloc[:,lfInds]
X_temp.iloc[:,range(len(lfInds ))] =X_temp.iloc[:,range(len(lfInds ))]/1000 #scale load forecast data
X_temp['shift'] = y_temp.shift( data_lag ).fillna(0)  # at decision time you have lagged y values
X_temp['WH_recent'] = raw_data['WESTERN HUBDAe'].shift(24).fillna(0) #at decsion time DA has clear for current day
#combine data into buckets of bucketSize
y_temp = pd.stats.moments.rolling_mean(y_temp, bucketSize )
X_temp = pd.stats.moments.rolling_mean(X_temp, bucketSize )
X = X_temp.iloc[bucketSize::bucketSize,:]
y = y_temp.iloc[bucketSize::bucketSize]
In [111]:
clf = RandomForestClassifier(n_estimators=8, max_depth =3,min_samples_split =5, verbose=0, random_state= 55)#verbose =3 for info
curr_date = data_lag/bucketSize # adjust for the lag
y_pred= []
y_act = []
#x_val = []
#date adjusts for regression and test intervals
st_dt_adj  = 0
end_dt_adj = 400
#build and test model for each interval
while (curr_date + reg_time + 24/bucketSize + st_dt_adj ) < len(X - end_dt_adj):
    X_train = X.iloc[ curr_date:curr_date+reg_time, : ]
    X_test  = X.iloc[ curr_date+reg_time:curr_date+reg_time+24/bucketSize, : ]
    y_train = y[ curr_date:curr_date+reg_time ]  
    y_test  = y[ curr_date+reg_time:curr_date+reg_time+24/bucketSize ]
    clf.fit(X_train, y_train )
    y_act.append( y_test[0] )
    y_pred.append(clf.predict(X_test)[0])
   # x_val.append( X_test )
    curr_date = curr_date + 24/bucketSize

y_pred = array(y_pred)
y_act  = array(y_act)
y_adj_val = copy.deepcopy(y_act)
y_adj_val[ abs(y_pred) > 1 ] = 0 #take stragtegy that volitile days are bad
y_adj_val2 = copy.deepcopy(y_act)                
y_adj_val2[ y_pred < 0 ] = 0 # Choose RT price when model tells you to. 
In [112]:
plt.hist([y_act[30::30], y_adj_val[30::30],y_adj_val2[30::30] ] , bins = 30, label=['Actual: %.1f' %std(y_act[30::30]), 'S1: %.1f' %std(y_adj_val[30::30]), 'S2: %.1f' %std(y_adj_val2[30::30])]);
plt.legend(loc=2)
plt.title( "APZ zonal 30-day PnL Histogram ($/MWh)" )
plt.xlabel('$/MWh PnL')
plt.ylabel('Frequency')
Out[112]:
<matplotlib.text.Text at 0x11b5e6710>
In [113]:
print mean(y_act)
print mean(y_adj_val )
print mean(y_adj_val2 )
-0.0611442371379
0.070713647678
0.387131911454

In []: