## IMPORT CUSTOM CAPSTONE FUNCTIONS
import functions_combined_BEST as ji
from functions_combined_BEST import ihelp, ihelp_menu, reload, inspect_variables

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# ## IMPORT MY PUBLISHED PYPI PACKAGE
# import bs_ds as bs
# from bs_ds.imports import *

# Suppress warnings
import warnings
warnings.filterwarnings('ignore')

# Set pd.set_option for tweet visibility
pd.set_option('display.max_colwidth', 100)
pd.set_option('display.max_columns', 50)
# Notebook output (kept for reference):
# bs_ds v0.9.10 loaded. Read the docs: https://bs-ds.readthedocs.io/en/latest/index.html
# For convenient loading of standard modules use: from bs_ds.imports import *

## def BlockTimeSeriesSplit
from sklearn.model_selection._split import _BaseKFold


class BlockTimeSeriesSplit(_BaseKFold):  # sklearn.model_selection.TimeSeriesSplit
    """A variant of sklearn.model_selection.TimeSeriesSplit that keeps
    train_size and test_size constant across folds.

    Parameters
    ----------
    n_splits : int
        Number of folds to generate.
    train_size : int or float
        Training-window size per fold: an int is a number of samples,
        a float < 1.0 is a ratio of ``len(X)``.
    test_size : int or float
        Test-window size per fold, same convention as ``train_size``.
    step_size : int, optional
        Offset between consecutive fold starts for ``method='sliding'``;
        computed automatically when None.
    method : str
        Either ``'normal'`` (non-overlapping, back-to-back folds) or
        ``'sliding'`` (fixed-stride, possibly overlapping folds).
    """

    def __init__(self, n_splits=5, train_size=None, test_size=None,
                 step_size=None, method='sliding'):
        super().__init__(n_splits, shuffle=False, random_state=None)
        self.train_size = train_size
        self.test_size = test_size
        self.step_size = step_size
        # NOTE: substring check preserved from the original — any string
        # containing 'sliding' or 'normal' is accepted.
        if 'sliding' in method or 'normal' in method:
            self.method = method
        else:
            raise Exception("Method may only be 'normal' or 'sliding'")

    def split(self, X, y=None, groups=None):
        """Yield (train_indices, test_indices) pairs over ``X``.

        Falls back from 'normal' to 'sliding' (with a warning) when the
        requested fold size cannot fit ``n_splits`` non-overlapping folds.
        """
        import math
        method = self.method

        # Get n_samples, train_size, test_size, step_size.
        n_samples = len(X)
        test_size = self.test_size
        train_size = self.train_size

        # If train_size / test_size are ratios, convert to index counts.
        if train_size < 1.0:
            train_size = math.floor(n_samples * train_size)
        if test_size < 1.0:
            test_size = math.floor(n_samples * test_size)

        # Save the sizes (all in integer form).
        self._train_size = train_size
        self._test_size = test_size

        # Calculate and save k_fold_size (train + test window).
        k_fold_size = self._test_size + self._train_size
        self._k_fold_size = k_fold_size
        indices = np.arange(n_samples)

        # Verify there is enough data to have non-overlapping k-folds.
        if method == 'normal':
            import warnings
            if n_samples // self._k_fold_size < self.n_splits:
                warnings.warn('The train and test sizes are too big for '
                              'n_splits using method="normal"\n'
                              'switching to method="sliding"')
                method = 'sliding'
                self.method = 'sliding'

        if method == 'normal':
            margin = 0
            for i in range(self.n_splits):
                start = i * k_fold_size
                stop = start + k_fold_size
                # mid splits the fold into its train and test windows.
                mid = int(start + self._train_size)
                yield indices[start: mid], indices[mid + margin: stop]

        elif method == 'sliding':
            step_size = self.step_size
            if step_size is None:
                # No step_size given: spread the fold starts evenly over
                # the range of valid start positions.
                last_possible_start = n_samples - self._k_fold_size
                step_range = range(last_possible_start)
                step_size = len(step_range) // self.n_splits
            # BUGFIX: the original assigned self._step_size only inside the
            # `if step_size is None:` branch, so a user-supplied step_size
            # raised AttributeError on the second fold. Assign it always.
            self._step_size = step_size

            for i in range(self.n_splits):
                if i == 0:
                    start = 0
                else:
                    start = prior_start + self._step_size
                stop = start + k_fold_size
                # mid splits the fold into its train and test windows.
                mid = int(start + self._train_size)
                prior_start = start
                yield indices[start: mid], indices[mid: stop]
# Note: this section was me learning to use Pipelines and ColumnTransformer.
# It is used for processing the data but is NOT meant to be an example.
## Using ColumnTransformer
from sklearn.model_selection import TimeSeriesSplit, train_test_split, GridSearchCV, cross_val_score, KFold
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder, MinMaxScaler
from sklearn.compose import ColumnTransformer, make_column_transformer
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
import xgboost as xgb

# Target column and the columns excluded from the feature matrix.
target_col = 'price_shifted'
cols_to_drop = ['price', 'pred_classes_int']
cols_to_drop.append(target_col)

features = df_to_model.drop(cols_to_drop, axis=1)
target = df_to_model[target_col]

# Boolean masks for which columns are numeric vs categorical.
numeric_cols = features.dtypes == 'float'
category_cols = ~numeric_cols
# target_col = df_to_model.columns=='price_shifted'

# Pipeline for scaling the price target.
price_transformer = Pipeline(steps=[
    ('scaler', MinMaxScaler())
])

# Pipeline for preparing numeric data.
numeric_transformer = Pipeline(steps=[
    # ('imputer', SimpleImputer(strategy='median')),
    ('scaler', MinMaxScaler())
])

# Pipeline for preparing categorical data.
category_transformer = Pipeline(steps=[
    # ('imputer', SimpleImputer(missing_values=np.nan,
    #                           strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Combine the per-dtype pipelines; untouched columns pass through unchanged.
preprocessor = ColumnTransformer(
    remainder='passthrough',
    transformers=[
        ('num', numeric_transformer, numeric_cols),
        ('cat', category_transformer, category_cols)
    ])
### ADDING MY OWN TRANSFORMATION SO CAN USE FEATURE IMPORTANCE
# Build a transformed copy of df_to_model by hand (scaled numeric columns +
# one-hot categoricals) so that readable feature names survive for
# feature-importance inspection.
df_tf = pd.DataFrame()

# Define number vs category column labels (boolean-mask indexing keeps
# only the labels whose mask value is True).
num_cols_list = numeric_cols[numeric_cols == True]
cat_cols_list = category_cols[category_cols == True]

for col in df_to_model.columns:
    if col in num_cols_list:
        # Scale each numeric column independently with the shared pipeline.
        vals = df_to_model[col].values
        tf_num = numeric_transformer.fit_transform(vals.reshape(-1, 1))
        try:
            df_tf[col] = tf_num.flatten()
        # BUGFIX: was a bare `except:` that also swallowed KeyboardInterrupt
        # and SystemExit; narrowed to Exception. Columns whose transformed
        # values cannot be assigned are still skipped (best-effort as before).
        except Exception:
            continue
    if col in cat_cols_list:
        # One-hot encode with pandas so the dummy columns keep readable names.
        df_temp = pd.get_dummies(df_to_model[col])
        df_tf = pd.concat([df_tf, df_temp], axis=1)

df_tf.head()