普通的特征

不要因为测试集的month取值没有在训练集中出现过就放弃这个特征,要相信模型的泛化能力。

1
2
3
4
5
6
7
8
9
10
ID = 'DateTime'
# Calendar features extracted from the datetime column.
# df['year'] = df[ID].dt.year  # year deliberately excluded (see note above the snippet)
df['month'] = df[ID].dt.month
df['quarter'] = df[ID].dt.quarter
df['day'] = df[ID].dt.day
df['dayofweek'] = df[ID].dt.dayofweek
df['dayofyear'] = df[ID].dt.dayofyear
# BUG FIX: Series.dt.week was removed in pandas 2.0; isocalendar().week is the
# documented replacement (UInt32, so cast back to int to match the old dtype).
df['week'] = df[ID].dt.isocalendar().week.astype(int)
df['hour'] = df[ID].dt.hour
# dayofweek is 0=Monday .. 6=Sunday, so >= 5 marks Saturday/Sunday
df['weekend'] = df['dayofweek'].apply(lambda x: 1 if x >= 5 else 0)

傅里叶级数

使得月份,天数等等连续,例如1-12月,1和12差距过大,如果使用傅里叶级数的话相当于变成sin或者cos表示,形成一个圆。

  • 其中fs是看数据的,像这个数据给的是一小时采样6次数据,一年就是1*6*24*365,如果是一天一采的数据就365
1
2
3
4
5
6
from scipy.signal import periodogram
# Spectral density of the target: fs = samples per year. This dataset is
# sampled 6x/hour, hence 1*6*24*365; daily data would use fs=365.
# `tr` / `out_cols` / `plt` are defined earlier in the file.
F,S=periodogram(tr[out_cols],fs=1*6*24*365,detrend='linear')
plt.plot(F,S)
plt.xscale('log')
# Tick positions are cycles/year; labels appear to mean Annual, half-year(2/yr),
# Quarterly, Monthly, ..., Daily, half-day — TODO confirm the 'h*' shorthand.
plt.xticks([1,2,4,12,24,52,104,365,365*2],['A','hA','Q','M','hM','W','hW','D','hD'])
plt.show()
  • 查看图像中峰值对应的频率,说明该周期成分的时间相关性比较强,值得据此构造傅里叶特征
1
2
3
4
5
6
'B' - business day, ie., Mon. - Fri.
'D' - daily
'W' - weekly
'M' - monthly
'A' - annual
'Q' - quarterly
1
2
3
4
5
6
7
from statsmodels.tsa.deterministic import DeterministicProcess, CalendarFourier
# Fourier seasonality terms: order-1 annual, order-2 weekly, order-4 daily.
CF1 = CalendarFourier('A', 1)
CF2 = CalendarFourier('W', 2)
CF3 = CalendarFourier('D', 4)
# BUG FIX: the original listed CF4, which was never defined (NameError);
# only the three terms built above exist.
dp = DeterministicProcess(index=df.DateTime, additional_terms=[CF1, CF2, CF3]).in_sample()
# BUG FIX: the join key must match the actual column name 'DateTime'
# (the original used lowercase 'datetime', which would raise KeyError).
df = df.merge(dp, left_on='DateTime', right_index=True, how='left')
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
def make_m(m=None):
    """Return a name -> unfitted regressor mapping for model comparison.

    m: optional dict to populate; a fresh dict is created when omitted.
       (The original default ``m={}`` was a shared mutable default: every
       call reused — and leaked entries into — the same dict object.)
    """
    if m is None:
        m = {}
    m['RandomForestRegressor'] = RandomForestRegressor(random_state=10)
    m['GradientBoostingRegressor'] = GradientBoostingRegressor(random_state=10)
    m['ExtraTreesRegressor'] = ExtraTreesRegressor(random_state=10)
    m['LGBMRegressor'] = LGBMRegressor(random_state=10)
    m['LinearRegression'] = LinearRegression()
    m['HuberRegressor'] = HuberRegressor()
    return m

def split_df(df, name):
    """Split the stacked train+test frame back into model inputs.

    Relies on the module-level ``tr`` (raw training frame) for the row count,
    assuming df is train rows followed by test rows — TODO confirm ordering.
    Returns (x, y, x_pre): train features, train target, test features.
    """
    n_train = tr.shape[0]
    train = df[:n_train]
    test = df[n_train:]
    # BUG FIX: drop(name, 1) used the positional axis argument, which was
    # removed in pandas 2.0; drop(columns=...) is the supported spelling.
    x = train.drop(columns=name)
    x_pre = test.drop(columns=name)
    y = train[name]
    return x, y, x_pre

def make_p(l, m):
    """Build a Pipeline from preprocessing steps ``l`` plus estimator ``m``.

    ``l`` is not mutated; the estimator is appended under the step name 'm'.
    """
    steps = list(l)
    steps.append(('m', m))
    return Pipeline(steps)

def valid_cv(p,x,y,cv=5,log=True):
    # Mean score over classic K-fold CV (not time-aware). `log` is accepted
    # only for signature parity with valid() below; it is unused here.
    return cross_val_score(p,x,y,cv=cv).mean()

def valid(p, x, y, cv=5, log=True):
    """Walk-forward validation: mean R^2 over TimeSeriesSplit folds.

    p: estimator/pipeline; x, y: full training data (the caller — test_df —
    passes y already log1p-transformed when log=True); log: back-transform
    to the original scale before scoring.
    """
    total = 0
    tss = TimeSeriesSplit(n_splits=cv)
    for tri, tei in tss.split(x):
        Xtr, Xte = x.iloc[tri, :], x.iloc[tei]
        Ytr, Yte = y.iloc[tri], y.iloc[tei]
        ans = p.fit(Xtr, Ytr).predict(Xte)
        if log:
            # BUG FIX: the original expm1-ed only the predictions, so R^2
            # compared original-scale predictions against log-scale truth.
            # Both sides must be on the same scale.
            ans = np.expm1(ans)
            Yte = np.expm1(Yte)
        total += r2_score(Yte, ans)
    return total / cv

def test_df(l, df, name, log=True, s=None, func=valid):
    """Score every model from make_m() on df, best first.

    l: preprocessing steps for make_p; name: target column; log: fit on
    log1p(target); func: validation function (defaults to walk-forward valid).
    s: optional dict collecting scores — the original default ``s={}`` was a
    shared mutable default, so scores leaked between successive calls.
    Returns a list of (model_name, score) sorted by score descending.
    """
    if s is None:
        s = {}
    x, y, x_pre = split_df(df, name)
    if log:
        y = np.log1p(y)
    for n, m in make_m().items():
        p = make_p(l, m)
        s[n] = func(p, x, y)
    return sorted(s.items(), key=lambda kv: kv[1], reverse=True)

diff差分/shift平移/ewm/rolling

主要用于特征制作lag的值,可以多尝试,target值也可以尝试ewm,但是这道题没用。

  • diff(i)
1
2
3
4
5
# Difference features: change of each column vs. i rows earlier.
features =['Temperature','Humidity']
rang = [1,2,3]
for f in features:
    for i in rang:
        # diff(i) leaves i leading NaNs; bfill() fills them from the first value
        df[f'{f}_lag_{i}'] = df[f].diff(i).bfill()
  • rolling.mean()
1
2
3
4
5
# Trailing rolling-mean features over windows of 1..3 rows.
features =['Temperature','Humidity']
rang = [1,2,3]
for f in features:
    for i in rang:
        # rolling(i).mean() leaves i-1 leading NaNs; bfill() fills them
        df[f'{f}_rolling_{i}'] = df[f].rolling(i).mean().bfill()
  • ewm 标签(可能数据泄露?)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
def ts_ft(x, by, on):
    """Add expanding-window stats and EWM means of column ``on`` per group.

    x: DataFrame, mutated in place — it is sorted by by+['dt'] and one new
       column is appended per statistic.
    by: list of grouping column names (must be a list: it is concatenated
        with ['dt'] and embedded in the feature names).
    on: name of the value column to summarize.

    Expanding stats (window=len(group), min_periods=1) are back-filled so
    the NaN head of each group (e.g. std of a single point) takes the first
    computed value; EWM means have no NaNs and are used as-is.
    NOTE(review): the collected lists are assigned back positionally, which
    is correct only because x remains in sorted group order after sort_values
    and groupby iterates groups in that same sorted order.
    """
    x.sort_values(by + ['dt'], inplace=True)
    names = ['mean', 'std', 'median', 'min', 'max', 'skew', 'kurt']
    sta = {s: [] for s in names}
    alphas = [0.1, 0.3, 0.5, 0.7, 0.9]
    for a in alphas:
        sta['ewm_{}'.format(a)] = []

    for _, g in x.groupby(by):
        val = g[on]
        # window = whole group with min_periods=1 -> expanding statistics
        rolled = val.rolling(window=len(g), min_periods=1)
        for s in names:
            # BUG FIX: fillna(method='bfill') is deprecated (pandas >= 2.1)
            # and removed in 3.0; .bfill() is the supported equivalent.
            sta[s].extend(getattr(rolled, s)().bfill())

        for a in alphas:
            ewm = val.ewm(alpha=a, adjust=False).mean()
            sta['ewm_{}'.format(a)].extend(ewm)

    for s in sta:
        ft_name = '{}_on_{}_by_{}'.format(s, on, by)
        x[ft_name] = sta[s]
  • ewm 特征变量
1
2
3
4
5
6
7
8
9
10
def time_series_feature(X, days=1):
    """Append smoothed and volatility features built from lagged columns.

    For each alpha in linspace(0.1, 1, 10): an EWM mean (adjust=False) of the
    day-lagged weather columns, and a rolling std with window int(alpha*20)
    of the lagged var1/weather columns. Leading NaNs from the shift are
    back-filled. Mutates and returns X.
    """
    ewm_cols = ['temperature', 'windspeed']
    std_cols = ['var1', 'temperature', 'windspeed']
    for alpha in np.linspace(0.1, 1, 10):
        for col in ewm_cols:
            smoothed = X[col].shift(days).ewm(alpha=alpha, adjust=False).mean()
            X[f"{alpha}ewm_{col}"] = smoothed.bfill()
        # window is the same for every column at a given alpha
        w = int(alpha * 20)
        for col in std_cols:
            vol = X[col].shift(days).rolling(window=w, min_periods=1).std()
            X[f"{w}std_{col}"] = vol.bfill()
    return X
  • lagging 标签(可能没啥用,缺失值太多了)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
def create_lagging(df, df_original, i):
    """Attach the visitor count from i days earlier as column 'lagging<i>'.

    Shifts df_original's visit_date forward by i days so that today's row
    lines up with the visitors value from i days ago, then left-joins on
    (air_store_id, visit_date). Rows without a match get NaN.
    """
    col = 'lagging' + str(i)
    shifted = df_original.copy()
    shifted['visit_date'] = shifted['visit_date'] + pd.DateOffset(days=i)
    shifted = shifted.rename(columns={'visitors': col})
    merged = pd.merge(
        df,
        shifted[['air_store_id', 'visit_date', col]],
        on=['air_store_id', 'visit_date'],
        how='left',
    )
    return merged

# Build lagging2..lagging21 columns of past target values.
# NOTE(review): the range starts at 2, presumably because lag 1 is not
# available at prediction time — TODO confirm against the forecast horizon.
lagging = 21
data1 = df.copy()
for i in range(2, lagging+1):
    data1 = create_lagging(data1, df, i)

验证方法

  • 只能看是否改进了,但是分数偏低
1
2
3
4
5
6
7
8
9
10
11
# Each fold enlarges the training window; fairly accurate for this problem.
def valid(p, X, y, cv=5):
    """Walk-forward validation: mean R^2 over TimeSeriesSplit folds.

    p: estimator/pipeline; X: feature DataFrame; y: target as a one-column
    DataFrame (switch to the 1-D indexing noted below for a Series).
    """
    tss = TimeSeriesSplit(max_train_size=None, n_splits=cv)
    s = 0
    # BUG FIX: the original called tss.split(x) — lowercase x, an unrelated
    # global — instead of the X parameter, so folds ignored the argument
    # (or raised NameError).
    for tr_i, te_i in tss.split(X):
        # print(tr_i)
        X_train, X_test = X.iloc[tr_i, :], X.iloc[te_i, :]
        y_train, y_test = y.iloc[tr_i, :], y.iloc[te_i, :]
        # y_train, y_test = y.iloc[tr_i], y.iloc[te_i]  # when y is 1-D
        s += r2_score(y_test, p.fit(X_train, y_train).predict(X_test))
    return s / cv
  • 经典的交叉验证分数比较高,能把控一下大概方向对不对。
1
2
def valid_cv(p,x,y,cv=5,log=True):
    # Classic (non-time-aware) cross-validation: scores run higher than the
    # walk-forward valid() above, but give a rough sense of direction.
    # `log` is unused; kept for signature parity with valid().
    return cross_val_score(p,x,y,cv=cv).mean()