저는 기계 학습에 Python을 사용하는 것이 아직 매우 생소합니다. 저는 포트란(Fortran)으로 프로그래밍을 해 왔기 때문에, 짐작하시겠지만 파이썬은 저에게 상당한 도약입니다. 저는 화학 분야에서 일하며 화학정보학(화학에 데이터 과학 기법을 적용하는 분야)에 관여하게 되었습니다. 따라서 파이썬의 광범위한 기계 학습 라이브러리를 활용하는 것이 중요합니다. 또한 제 코드는 효율적이어야 합니다. 저는 실행되고 제대로 동작하는 것으로 보이는 코드를 작성했습니다. 제가 알고 싶은 것은 다음과 같습니다. (제목: Python 랜덤 포레스트와 기계 학습 - 개선)
1. 이 코드를 어떻게 개선하고 더 효율적으로 만들 수 있는지.
2. 제가 사용한 방법에 대한 대안 제안과, 가능하다면 그 대안이 더 나은 이유.
저는 주로 연속형 데이터와 회귀 모델을 다룹니다.
어떤 제안이든 환영하며, 미리 감사드립니다.
import math
import os.path
import sys
from time import time

import numpy as np
import pandas as pd
import plotly.plotly as py
import scipy
import scipy.stats
from sklearn import preprocessing, metrics, cross_validation
from sklearn.cross_validation import train_test_split
from sklearn.cross_validation import KFold
from sklearn.ensemble import RandomForestRegressor
from sklearn.grid_search import GridSearchCV
# --- Load the input data set ---
# The user is prompted for a CSV file with a header row; the first column is
# treated as the sample labels and is not numeric.
try:
    read_line = raw_input  # Python 2
except NameError:
    read_line = input  # Python 3: raw_input was renamed to input
fname = read_line('Please enter the input file name containing total dataset and descriptors (assumes csv file, column headings and first column are labels\n')
# Stop early with a readable message rather than a pandas traceback
if not os.path.isfile(fname):
    sys.exit("ERROR: input file does not exist")
SubFeAll = pd.read_csv(fname, sep=",")
# Replace the NA values with the mean of the descriptor. numeric_only=True
# keeps the text label column out of the mean; recent pandas versions raise
# a TypeError when asked to average a non-numeric column.
SubFeAll = SubFeAll.fillna(SubFeAll.mean(numeric_only=True))
header = SubFeAll.columns.values  # Use the column headers as the descriptor labels
# --- Prepare the numpy arrays used by the cross validation ---
# Fix the numpy global random number seed (similar effect to random_state)
np.random.seed(1)
# Per-fold random forest metrics, filled in as each fold finishes
RFr2, RFmse, RFrmse = [], [], []
# Collected predictions and the count of folds completed so far
RFpredictions = []
metcount = 0
# Hand the pandas table over to numpy and record its dimensions
npArray = np.array(SubFeAll)
datax, datay = npArray.shape
print(header.shape)
# Descriptor names: every column heading except the first and the last
npheader = np.array(header[1:-1])
print("Array shape X = %d, Y = %d " % (datax, datay))
# Echo one raw value so the user can sanity-check what was read
print("The first element of the input data set, as a minial check please ensure this is as expected = %s" % npArray[0][0])
# Column 0 holds the molecule labels, the last column the true results (y)
# and everything in between the descriptors (X), which are standardised
names = npArray[:, 0]
descriptors = npArray[:, 1:-1].astype(float)
y = npArray[:, -1].astype(float)
X = preprocessing.scale(descriptors)
print(X.shape)
# --- Ten fold cross validation of a grid-searched random forest ---
# For every fold: grid search the forest parameters on the training part,
# refit with the best parameters, predict the held-out part and log the
# results. Training details go to Training.csv, per-sample predictions and
# the final statistics to Predictions.csv, and the feature rankings to
# Feature_importance.csv.


def _rows_of_rounded(values):
    """Split values into equal-width rows of text for display.

    Every value is rounded to two decimals and converted with str(). The row
    width is the first of 5, 4, 3 or 2 that divides len(values) exactly;
    if none does, each value gets its own row.

    Returns a list of rows, each row being a list of strings.
    """
    count = len(values)
    width = next((w for w in (5, 4, 3, 2) if count % w == 0), 1)
    cells = [str(round(value, 2)) for value in values]
    return [cells[i:i + width] for i in range(0, count, width)]


n_folds = 10
# Open output files
train_name = "Training.csv"
test_name = "Predictions.csv"
fi_name = "Feature_importance.csv"
# One prediction slot per sample, addressed by the sample's row in the input.
# KFold below shuffles, so a fold's test rows are scattered through the data
# set; appending predictions fold by fold would pair them with the wrong
# names and true values.
RFpredictions = np.empty(datax)
npredicted = 0
# The with statement closes all three files, also when sys.exit is called
with open(train_name,'w') as ftrain, open(test_name,'w') as fpred, open(fi_name,'w') as ffeatimp:
    ftrain.write("This file contains the training information for the Random Forest models\n")
    ftrain.write("The code use a ten fold cross validation 90% training 10% test at each fold so ten training sets are used here,\n")
    ftrain.write("Interation %d ,\n" %(metcount+1))
    fpred.write("This file contains the prediction information for the Random Forest models\n")
    fpred.write("Predictions are made over a ten fold cross validation hence training on 90% test on 10%. The final prediction are return iteratively over this ten fold cros validation once,\n")
    fpred.write("optimised parameters are located via a grid search at each fold,\n")
    fpred.write("Interation %d ,\n" %(metcount+1))
    ffeatimp.write("This file contains the feature importance information for the Random Forest model,\n")
    ffeatimp.write("Interation %d ,\n" %(metcount+1))
    # Begin the K-fold cross validation over ten folds
    kf = KFold(datax, n_folds=n_folds, shuffle=True, random_state=0)
    print("------------------- Begining Ten Fold Cross Validation -------------------")
    for train, test in kf:
        XTrain, XTest, yTrain, yTest = X[train], X[test], y[train], y[test]
        print("The test set values are : ")
        for row in _rows_of_rounded(yTest):
            # Screen: values separated by a space and a tab; file: CSV cells
            print(" \t".join(row))
            ftrain.write(",".join(row) + ",\n")
        print("\n")
        # random forest grid search parameters
        print("------------------- Begining Random Forest Grid Search -------------------")
        rfparamgrid = {"n_estimators": [10], "max_features": ["auto", "sqrt", "log2"], "max_depth": [5,7]}
        rf = RandomForestRegressor(random_state=0,n_jobs=2)
        RfGridSearch = GridSearchCV(rf,param_grid=rfparamgrid,scoring='mean_squared_error',cv=10)
        start = time()
        RfGridSearch.fit(XTrain,yTrain)
        # Take the elapsed time once so the screen and the file report the
        # same number (a plain float, not a (seconds, candidates) tuple)
        RFtime = time() - start
        ncandidates = len(RfGridSearch.grid_scores_)
        print("GridSearchCV took %.2f seconds for %d candidate parameter settings" % (RFtime, ncandidates))
        # Get best random forest parameters
        best = RfGridSearch.best_params_
        ne = best['n_estimators']
        mf = best['max_features']
        md = best['max_depth']
        print("n_estimators = %d " % ne)
        print("max_features = %s " % mf)
        print("max_depth = %d " % md)
        # Terminate the section label so it does not fuse with the next record
        ftrain.write("Random Forest,\n")
        ftrain.write("RF search time, %s ,\n" % str(RFtime))
        ftrain.write("Number of Trees, %s ,\n" % str(ne))
        ftrain.write("Number of feature at split, %s ,\n" % str(mf))
        ftrain.write("Max depth of tree, %s ,\n" % str(md))
        # Train random forest and predict with optimised parameters
        print("\n\n------------------- Starting opitimised RF training -------------------")
        optRF = RandomForestRegressor(n_estimators = ne, max_features = mf, max_depth = md, random_state=0)
        optRF.fit(XTrain, yTrain)  # Train the model
        RFfeatimp = optRF.feature_importances_
        indices = np.argsort(RFfeatimp)[::-1]  # most important feature first
        print("Training R2 = %5.2f" % optRF.score(XTrain,yTrain))
        print("Starting optimised RF prediction")
        RFpreds = optRF.predict(XTest)
        print("The predicted values now follow :")
        for row in _rows_of_rounded(RFpreds):
            print(" \t".join(row))
        print("\n")
        # Metrics for this fold on its held-out test set
        fold_r2 = optRF.score(XTest, yTest)
        fold_mse = metrics.mean_squared_error(yTest,RFpreds)
        fold_rmse = math.sqrt(fold_mse)
        RFr2.append(fold_r2)
        RFmse.append(fold_mse)
        RFrmse.append(fold_rmse)
        metcount += 1
        print("Random Forest prediction statistics for fold %d are; MSE = %5.2f RMSE = %5.2f R2 = %5.2f\n\n" % (metcount, fold_mse, fold_rmse, fold_r2))
        ftrain.write("Random Forest prediction statistics for fold %d are, MSE =, %5.2f, RMSE =, %5.2f, R2 =, %5.2f,\n\n" % (metcount, fold_mse, fold_rmse, fold_r2))
        ffeatimp.write("Feature importance rankings from random forest,\n")
        for rank, idx in enumerate(indices, 1):
            ffeatimp.write("%d. , feature %d , %s, (%f),\n" % (rank, idx, npheader[idx], RFfeatimp[idx]))
        # Store each prediction at the original position of its sample
        RFpredictions[test] = RFpreds
        npredicted += len(test)
        ftrain.write("Fold %d, \n" %(metcount))
        # Only announce a next fold when there is one
        if metcount < n_folds:
            print("------------------- Next Fold %d -------------------" %(metcount+1))
    lennames = names.shape[0]
    lentrue = y.shape[0]
    # Every sample must have been predicted exactly once across the folds
    if lennames == npredicted == lentrue:
        fpred.write("Names/Label,, Prediction Random Forest,, True Value,\n")
        for label, predicted, actual in zip(names, RFpredictions, y):
            fpred.write(str(label)+",,"+str(predicted)+",,"+str(actual)+",\n")
    else:
        fpred.write("ERROR - names, prediction and true value array size mismatch. Dumping arrays for manual inspection in predictions.csv\n")
        fpred.write("Array printed in the order names/Labels, predictions RF and true values\n")
        # Write every element as text; arrays cannot be concatenated to "\n"
        for dumped in (names, RFpredictions, y):
            fpred.write(",".join(str(value) for value in dumped) + "\n")
        sys.exit("ERROR - names, prediction and true value array size mismatch. Dumping arrays for manual inspection in predictions.csv")
    print("Final averaged Random Forest metrics : ")
    # Average over however many folds were actually run
    RFamse = np.mean(RFmse)
    RFmse_sd = np.std(RFmse)
    RFarmse = np.mean(RFrmse)
    RFrmse_sd = np.std(RFrmse)
    # Linear fit of the true values against the out-of-fold predictions
    RFslope, RFintercept, RFr_value, RFp_value, RFstd_err = scipy.stats.linregress(RFpredictions, y)
    RFR2 = RFr_value**2
    print("Average Mean Squared Error =  %s  +/-  %s" % (RFamse, RFmse_sd))
    print("Average Root Mean Squared Error =  %s  +/-  %s" % (RFarmse, RFrmse_sd))
    print("R2 Final prediction against True values =  %s" % RFR2)
    fpred.write("\n")
    fpred.write("FINAL PREDICTION STATISTICS,\n")
    fpred.write("Random Forest average MSE, %s, +/-, %s,\n" %(str(RFamse), str(RFmse_sd)))
    fpred.write("Random Forest average RMSE, %s, +/-, %s,\n" %(str(RFarmse), str(RFrmse_sd)))
    fpred.write("Random Forest slope, %s, Random Forest intercept, %s,\n" %(str(RFslope), str(RFintercept)))
    fpred.write("Random Forest standard error, %s,\n" %(str(RFstd_err)))
    fpred.write("Random Forest R, %s,\n" %(str(RFr_value)))
    fpred.write("Random Forest R2, %s,\n" %(str(RFR2)))
데이터에 특징 선택(feature selection)을 추가하는 것도 고려해 볼 수 있습니다.
이것이 개선의 여지가 있다고 생각되는 **동작하는 코드(working code)**라면 [codereview.se]를 참조하십시오. 그렇지 않다면 [mcve]와 함께 문제를 설명하십시오. – jonrsharpe
안녕하세요 - 전반적으로 좋은 코드입니다. 작은 팁: stdout으로 출력하는 print 문마다 별도의 `write`를 둘 필요는 없습니다. 작업을 편하게 하려면 'heredoc'(삼중 따옴표로 묶은 문자열)을 찾아보세요 ;) 참고: http://lofic.github.io/tips/python-heredoc.html – javadba
저는 코드를 검토하지는 않았습니다. 일반적인 제안을 하나 드리자면, RandomForest를 사용한다면 SkLearn의 ExtraTrees도 함께 시도해 보시기 바랍니다. ExtraTrees는 랜덤 포레스트(Random Forests)에 무작위성을 한 층 더 추가한 방법이며, [논문](http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.65.7485&rep=rep1&type=pdf)에 따르면 일반적으로 성능이 더 좋습니다. –