
I Wrote an Automated Machine Learning Project

Whether in a competition or in real engineering work, once the data is processed you never test with just a single algorithm; comparing several is unavoidable. autoclf is a simple framework I hacked together in two days while taking part in Alibaba's ATEC payment-risk (anti-fraud) competition.

The directory structure is:

├── [4.0K]  clf
│   └── [4.0K]  nn
├── [4.0K]  data
├── [4.0K]  pipe
└── [4.0K]  saved

The clf directory holds the common and custom algorithms, and its nn subdirectory holds custom deep learning algorithms bound to the sklearn interface. A custom algorithm is just a class that implements fit, score, and predict; if your fit already delegates to sklearn training, there is no need to define score and predict yourself. For example (clf/isvc.py):


from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.feature_selection import SelectKBest, chi2

from sklearn.decomposition import PCA, NMF
from sklearn.svm import SVC


class IGridSVC():
    # grid-search over two reduction strategies (PCA/NMF vs. chi2
    # univariate selection) and the SVC penalty C
    N_FEATURES_OPTIONS = [2, 4]
    C_OPTIONS = [1, 10, 100, 1000]
    param_grid = [
        {
            'reduce_dim': [PCA(iterated_power=7), NMF()],
            'reduce_dim__n_components': N_FEATURES_OPTIONS,
            'classify__C': C_OPTIONS
        },
        {
            'reduce_dim': [SelectKBest(chi2)],
            'reduce_dim__k': N_FEATURES_OPTIONS,
            'classify__C': C_OPTIONS
        },
    ]

    pipe = Pipeline([
        ('reduce_dim', PCA()),
        ('classify', SVC(kernel="linear", probability=True))
    ])

    def __init__(self):
        self.model = None

    def fit(self, x_train, y_train):
        self.model = GridSearchCV(IGridSVC.pipe, cv=3, n_jobs=-1,
                                  param_grid=IGridSVC.param_grid)
        self.model = self.model.fit(x_train, y_train)

    def score(self, x_test, y_test):
        return self.model.score(x_test, y_test)

    def predict(self, x_test):
        return self.model.predict(x_test)

    def predict_proba(self, x_test):
        return self.model.predict_proba(x_test)
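
The only contract the training loop relies on is duck typing: any class exposing fit, score, and predict can join the comparison. And as noted above, when fit just delegates to sklearn, the simplest possible custom class is barely more than a subclass. A minimal hypothetical sketch (ITinyLR is not a file in the repo):

from sklearn.linear_model import LogisticRegression

class ITinyLR(LogisticRegression):
    # Hypothetical example: by inheriting from a sklearn estimator,
    # fit, score, predict and predict_proba all come for free.
    pass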

Then just call it from train.py. The project is still being polished; I plan to add a command-line interface. The most important parts of training are undoubtedly data preprocessing and feature selection, and the data passed into train.py comes from the preprocessing functions defined in the pipe folder.
train.py

import os

import pandas as pd
import numpy as np

# sklearn common imports
from sklearn.metrics import confusion_matrix, roc_auc_score
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.feature_selection import SelectKBest, SelectFromModel
from sklearn.pipeline import Pipeline
from sklearn.externals import joblib

# Decomposition
# PCA is unsupervised, LDA is supervised
from sklearn.decomposition import PCA
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA

# Some classifier algorithms
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, \
    ExtraTreesClassifier, GradientBoostingClassifier, VotingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from xgboost import XGBClassifier

from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression


def train(x_train, y_train, x_test, y_test):

    clf1 = DecisionTreeClassifier()  # max_depth=4
    clf1_1 = RandomForestClassifier()
    clf2 = KNeighborsClassifier(n_neighbors=7)
    clf3 = SVC(kernel='rbf', probability=True)  # very slow
    clf4 = LogisticRegression(random_state=1)
    clf5 = XGBClassifier()
    clf6 = GaussianNB()
    clf7 = AdaBoostClassifier(DecisionTreeClassifier(max_depth=1),
                              algorithm="SAMME", n_estimators=200)

    from clf import IGridSVC
    c_clf1 = IGridSVC()

    voting1 = VotingClassifier(
        estimators=[
            ('dt', clf1),
            ('knn', clf2),
            ('svc', clf3),
            ('lg', clf4)
        ],
        voting='soft'
    )

    voting2 = VotingClassifier(
        estimators=[
            ('dt', clf1),
            ('knn', clf2),
            ('svc', clf3),
            ('lg', clf4),
            ('xgb', clf5)
        ],
        voting='soft'
    )

    clfs = [
        c_clf1,
        clf3,
        clf1, clf1_1, clf2, clf4, clf5, clf6, clf7, voting1, voting2
    ]

    for clf in clfs:
        name = clf.__class__.__name__
        modeldumpname = "saved/{}.pkl".format(name.lower())

        print("[*] Now Training With {:<10s}".format(name))

        try:
            clf.fit(x_train, y_train)
            score = clf.score(x_test, y_test)
            if os.path.isfile(modeldumpname):
                print("[x] {} Already Exists".format(modeldumpname))
                modeldumpname = "{}.second".format(modeldumpname)
                print("[-] Rename {}".format(modeldumpname))

            joblib.dump(clf, modeldumpname)

            print("[+] Saving Model {:<10s} with accuracy: {}".format(modeldumpname, score))

        except KeyboardInterrupt:
            print("[-] Skip {}".format(name))
        # if not name.startswith("i"):  # custom classes do not implement cross validation
        #     score = np.mean(cross_val_score(clf, x_train, y_train, cv=10))
        # else:
        #     score = np.mean(clf.cross_val_score(x_train, y_train, cv=10))


if __name__ == '__main__':

    print('Loading Data....', end='', flush=True)
    from pipe import iload_iris_pipe
    x_train, y_train, x_test, y_test = iload_iris_pipe()
    print('\tDone')
    train(x_train, y_train, x_test, y_test)
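
Note that train.py imports iload_iris_pipe from pipe, a loader the post does not show (only the ATEC loader below is shown). Under the assumption that every pipe function returns the same (x_train, y_train, x_test, y_test) 4-tuple, such a loader would look roughly like this hedged sketch, not the repo's actual implementation:

from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split

def iload_iris_pipe():
    # load the built-in iris dataset and split 80/20
    iris = load_iris()
    x_train, x_test, y_train, y_test = train_test_split(
        iris.data, iris.target, test_size=0.2, random_state=42)
    # note the return order train() expects: train pair first, then test pair
    return x_train, y_train, x_test, y_test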

pipe/iload_aliatec.py

import pandas as pd
import os

from sklearn.model_selection import train_test_split

train_data_path = 'data/atec_anti_fraud_train.csv'
predict_data_path = 'data/atec_anti_fraud_test_a.csv'

# non-feature columns to drop before training
DROPCOLUMS = ["id", "label", "date"]
# label is 0 or 1: 0 is safe, 1 is fraud (-1 means unlabeled)


def iload_aliatec_pipe():

    if os.path.isfile(train_data_path) and os.path.isfile(predict_data_path):
        print("[√] Path Checked, File Exists")
    else:
        print("[X] Please Make Sure Your Datasets Exist")
        import sys
        sys.exit(1)

    data = pd.read_csv(train_data_path)
    data = data.fillna(0)
    unlabeled = data[data['label'] == -1]  # set aside, not used for training yet
    labeled = data[data['label'] != -1]

    train, test = train_test_split(labeled, test_size=0.2, random_state=42)

    cols = [c for c in DROPCOLUMS if c in train.columns]
    x_train = train.drop(cols, axis=1)

    cols = [c for c in DROPCOLUMS if c in test.columns]
    x_test = test.drop(cols, axis=1)

    y_train = train['label']
    y_test = test['label']
    return x_train, y_train, x_test, y_test


def iload_predict_data():
    upload_test = pd.read_csv(predict_data_path)
    upload_test = upload_test.fillna(0)
    upload_id = upload_test['id']

    cols = [c for c in DROPCOLUMS if c in upload_test.columns]
    upload_test = upload_test.drop(cols, axis=1)

    return upload_id, upload_test


def isave_predict_data(data_id, predict, filename):
    p = pd.DataFrame(predict, columns=["score"])
    res = pd.concat([data_id, p], axis=1)
    res.to_csv(filename, index=False)
    print("[+] Save Predict Result To {} Successful".format(filename))

Prediction lives in a separate predict.py: it automatically loads the models saved by train.py, runs batch predictions, and writes the results to the corresponding folder.
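
The post does not show predict.py itself, but given the pipe helpers above, a hedged sketch of the flow it describes might look like this (the model filename and output path are my assumptions):

from sklearn.externals import joblib  # same era-appropriate import as train.py

from pipe import iload_predict_data, isave_predict_data

# assumption: load one of the models train.py dumped into saved/
model = joblib.load("saved/xgbclassifier.pkl")

upload_id, upload_test = iload_predict_data()
# isave_predict_data writes a "score" column, so class-1 probabilities fit naturally
predict = model.predict_proba(upload_test)[:, 1]
isave_predict_data(upload_id, predict, "saved/submission.csv")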

P.S. 1,332 people entered, yet somehow only 36 have submitted results so far; the strong players all submit late, and I really am still a rookie. What I can't figure out is that this contest only runs its evaluation once per day, unlike the earlier Tencent ads contest, where every submission was scored immediately and tuning was convenient. In any case, I already have a new idea for this problem: the key may not be the -1 labels themselves, but they are definitely a breakthrough point.

(Screenshot from 2018-05-03 23:58)
