1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45
def make_m(m=None):
    """Return a dict of named candidate regressors, seeded for reproducibility.

    Args:
        m: optional dict to populate in place; a fresh dict is created when
           omitted.  (The original used ``m={}`` — a mutable default shared
           across calls, so every call mutated and returned the same dict.)

    Returns:
        dict mapping model-class name -> unfitted estimator instance.
    """
    if m is None:
        m = {}
    # Tree ensembles get a fixed random_state so scores are comparable run-to-run.
    m['RandomForestRegressor'] = RandomForestRegressor(random_state=10)
    m['GradientBoostingRegressor'] = GradientBoostingRegressor(random_state=10)
    m['ExtraTreesRegressor'] = ExtraTreesRegressor(random_state=10)
    m['LGBMRegressor'] = LGBMRegressor(random_state=10)
    # Linear baselines are deterministic; no seed needed.
    m['LinearRegression'] = LinearRegression()
    m['HuberRegressor'] = HuberRegressor()
    return m
def split_df(df, name, n_train=None):
    """Split a stacked train+test frame into train features/target and test features.

    Assumes ``df`` is the training rows followed by the test rows (the usual
    "concat, feature-engineer, re-split" pattern).

    Args:
        df: combined DataFrame, train rows first.
        name: target column name.
        n_train: number of leading rows that are training data.  Defaults to
                 ``tr.shape[0]`` (the module-level train frame), preserving
                 the original behavior.

    Returns:
        (x, y, x_pre): train features, train target, test features.
    """
    if n_train is None:
        n_train = tr.shape[0]  # original behavior: length of the global train frame
    train = df.iloc[:n_train]
    test = df.iloc[n_train:]
    # drop(name, 1) relied on a positional axis argument, which was removed
    # in pandas 2.0; drop(columns=...) is the supported spelling.
    x = train.drop(columns=[name])
    x_pre = test.drop(columns=[name])
    y = train[name]
    return x, y, x_pre
def make_p(l, m):
    """Build a Pipeline: the preprocessing steps in ``l`` followed by model ``m``.

    ``l`` is never mutated; the model is appended under the step name 'm'.
    """
    return Pipeline(l + [('m', m)])
def valid_cv(p, x, y, cv=5, log=True):
    """Mean k-fold cross-validation score for pipeline ``p``.

    ``log`` is accepted only for signature parity with ``valid`` (so either
    can be passed as the ``func`` argument of ``test_df``); it is unused here.
    """
    scores = cross_val_score(p, x, y, cv=cv)
    return scores.mean()
def valid(p, x, y, cv=5, log=True):
    """Mean R^2 of pipeline ``p`` over time-series cross-validation folds.

    Args:
        p: estimator/pipeline with fit/predict.
        x, y: features and target (pandas objects; indexed with .iloc).
        cv: number of TimeSeriesSplit folds.
        log: when True, ``y`` is expected to already be in log1p space (see
             ``test_df``); predictions AND held-out targets are mapped back
             with expm1 so R^2 is computed on the original scale.

    Returns:
        Average R^2 across the ``cv`` folds.
    """
    total = 0.0
    tss = TimeSeriesSplit(n_splits=cv)
    for tri, tei in tss.split(x):
        Xtr, Xte = x.iloc[tri, :], x.iloc[tei, :]
        Ytr, Yte = y.iloc[tri], y.iloc[tei]
        ans = p.fit(Xtr, Ytr).predict(Xte)
        if log:
            ans = np.expm1(ans)
            # BUG FIX: the original back-transformed only the predictions,
            # so r2_score compared expm1(pred) against log1p-scale targets.
            Yte = np.expm1(Yte)
        total += r2_score(Yte, ans)
    return total / cv
def test_df(l, df, name, log=True, s=None, func=valid):
    """Score every candidate model on ``df`` and rank them best-first.

    Args:
        l: list of (name, transformer) preprocessing steps for the pipeline.
        df: combined train+test frame (split via ``split_df``).
        name: target column name.
        log: apply log1p to the target before fitting; forwarded to ``func``
             so it knows whether to back-transform.
        s: optional dict to collect scores into; a fresh dict is created when
           omitted.  (The original used ``s={}`` — a mutable default that
           accumulated scores across separate calls.)
        func: scoring function with signature ``(p, x, y, ..., log=...)``,
              e.g. ``valid`` or ``valid_cv``.

    Returns:
        List of (model_name, score) pairs sorted by score, descending.
    """
    if s is None:
        s = {}
    x, y, x_pre = split_df(df, name)
    if log:
        y = np.log1p(y)
    for n, m in make_m().items():
        p = make_p(l, m)
        # BUG FIX: ``log`` was not forwarded, so ``valid`` always assumed a
        # log1p target and expm1'd predictions even when log=False here.
        s[n] = func(p, x, y, log=log)
    return sorted(s.items(), key=lambda kv: kv[1], reverse=True)
|