Python 랜덤 포레스트 및 기계 학습 - 개선

-2

저는 기계 학습에 Python을 사용하는 것이 아주 처음입니다. 저는 포트란(Fortran)으로 프로그래밍해 온 배경을 가지고 있어서, 짐작하시겠지만 Python은 저에게 꽤 큰 도약입니다. 저는 화학 분야에서 일하며 화학정보학(화학에 데이터 과학 기법을 적용하는 분야)에 참여하게 되었습니다. 따라서 Python의 방대한 기계 학습 라이브러리를 활용하는 것이 중요합니다. 또한 제 코드는 효율적이어야 합니다. 실행되고 문제없이 작동하는 것으로 보이는 코드를 작성했습니다. 제가 알고 싶은 것은 다음과 같습니다:

1. 이 코드를 어떻게 개선하고 더 효율적으로 만들 수 있는지.

2. 제가 사용한 방식에 대한 대안 제안, 그리고 가능하다면 다른 방법이 더 나은 이유.

저는 주로 연속형 데이터와 회귀 모델로 작업합니다.

어떤 제안이든 환영하며, 미리 감사드립니다.

import math
import os.path
import sys
from time import time

import numpy as np
import pandas as pd
import plotly.plotly as py
import scipy
import scipy.stats
from sklearn import preprocessing, metrics, cross_validation
from sklearn.cross_validation import KFold
from sklearn.cross_validation import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.grid_search import GridSearchCV

# Ask the user for the CSV file holding the complete data set and abort
# straight away when the given path is not an existing file.
input_prompt = 'Please enter the input file name containing total dataset and descriptors (assumes csv file, column headings and first column are labels\n'
fname = str(raw_input(input_prompt))
if not os.path.isfile(fname):
    sys.exit("ERROR: input file does not exist")
SubFeAll = pd.read_csv(fname, sep=",")

# Fill missing values with the mean of the column they belong to.
SubFeAll = SubFeAll.fillna(SubFeAll.mean())
# The column headings double as the descriptor labels.
header = SubFeAll.columns.values
SubFeAll.head()

# Seed numpy's global random number generator (similar effect to random_state).
np.random.seed(1)

# Per-fold Random Forest metrics, the collected predictions and the fold
# counter; all of these are filled in by the cross-validation loop.
RFr2, RFmse, RFrmse = [], [], []
RFpredictions = []
metcount = 0

# Hand the pandas frame over to numpy.
npArray = np.array(SubFeAll)
print(header.shape)
# Descriptor labels only: drop the first (name) and last (target) headings.
npheader = np.array(header[1:-1])
print("Array shape X = %d, Y = %d " % (npArray.shape))
datax, datay = npArray.shape

# Echo one raw value so the user can sanity-check that the file was parsed.
print("The first element of the input data set, as a minial check please ensure this is as expected = %s" % npArray[0,0])

# Column layout: first column = labels, last column = true values,
# everything in between = descriptors.
names = npArray[:,0]
y = npArray[:,-1].astype(float)
# NOTE(review): the descriptors are standardised over the whole data set,
# i.e. before the later cross-validation split - confirm this is intended.
X = preprocessing.scale(npArray[:,1:-1].astype(float))
print(X.shape)

# ---------------------------------------------------------------------------
# Ten fold cross validation of a grid-searched Random Forest regressor.
#
# Reads (defined earlier in the script): X, y, names, npheader, datax and the
# accumulators RFr2 / RFmse / RFrmse / metcount.
# Writes: Training.csv (per-fold log), Predictions.csv (out-of-fold
# predictions plus summary statistics) and Feature_importance.csv.
# Exits via sys.exit() if the collected predictions do not cover every sample.
# ---------------------------------------------------------------------------
train_name = "Training.csv"
test_name = "Predictions.csv"
fi_name = "Feature_importance.csv"


def _rounded_rows(values):
    """Return *values* rounded to 2 d.p. as rows of strings for display.

    The row width is the first of 5, 4, 3 or 2 that divides the number of
    values exactly; when none does, each row holds a single value.
    """
    total = len(values)
    width = next((w for w in (5, 4, 3, 2) if total % w == 0), 1)
    return [[str(round(v, 2)) for v in values[pos:pos + width]]
            for pos in range(0, total, width)]


# Out-of-fold predictions stored at each sample's ORIGINAL row index so that
# they stay aligned with `names` and `y` even though the folds are shuffled.
# NaN marks a sample that has not been predicted (yet).
RFpredictions = np.empty(y.shape[0])
RFpredictions.fill(np.nan)

with open(train_name, 'w') as ftrain, open(test_name, 'w') as fpred, open(fi_name, 'w') as ffeatimp:
    ftrain.write("This file contains the training information for the Random Forest models\n")
    ftrain.write("The code use a ten fold cross validation 90% training 10% test at each fold so ten training sets are used here,\n")
    ftrain.write("Interation %d ,\n" % (metcount + 1))

    fpred.write("This file contains the prediction information for the Random Forest models\n")
    fpred.write("Predictions are made over a ten fold cross validation hence training on 90% test on 10%. The final prediction are return iteratively over this ten fold cros validation once,\n")
    fpred.write("optimised parameters are located via a grid search at each fold,\n")
    fpred.write("Interation %d ,\n" % (metcount + 1))

    ffeatimp.write("This file contains the feature importance information for the Random Forest model,\n")
    ffeatimp.write("Interation %d ,\n" % (metcount + 1))

    # Begin the K-fold cross validation over ten folds
    kf = KFold(datax, n_folds=10, shuffle=True, random_state=0)
    print("------------------- Begining Ten Fold Cross Validation -------------------")
    for train, test in kf:
        XTrain, XTest = X[train], X[test]
        yTrain, yTest = y[train], y[test]

        # Echo the true test values to the console and the training log.
        print("The test set values are : ")
        for row in _rounded_rows(yTest):
            print(" \t".join(row))
            ftrain.write(",".join(row) + ",\n")
        print("\n")

        # Inner grid search (10-fold CV on the training part only).
        print("------------------- Begining Random Forest Grid Search -------------------")
        rfparamgrid = {"n_estimators": [10], "max_features": ["auto", "sqrt", "log2"], "max_depth": [5, 7]}
        rf = RandomForestRegressor(random_state=0, n_jobs=2)
        RfGridSearch = GridSearchCV(rf, param_grid=rfparamgrid, scoring='mean_squared_error', cv=10)
        start = time()
        RfGridSearch.fit(XTrain, yTrain)
        # Measure once so the console and the log report the same number.
        RFtime = time() - start

        # Report the best random forest parameters
        print("GridSearchCV took %.2f seconds for %d candidate parameter settings" % (RFtime, len(RfGridSearch.grid_scores_)))
        best_params = RfGridSearch.best_params_
        ne = best_params['n_estimators']
        mf = best_params['max_features']
        md = best_params['max_depth']
        print("n_estimators = %d " % ne)
        print("max_features = %s " % mf)
        print("max_depth = %d " % md)

        ftrain.write("Random Forest,\n")
        ftrain.write("RF search time, %s ,\n" % str(RFtime))
        ftrain.write("Number of Trees, %s ,\n" % str(ne))
        ftrain.write("Number of feature at split, %s ,\n" % str(mf))
        ftrain.write("Max depth of tree, %s ,\n" % str(md))

        # GridSearchCV (refit=True by default) has already re-trained the
        # best parameter combination on the full training fold with the same
        # random_state, so reuse that model instead of fitting it again.
        print("\n\n------------------- Starting opitimised RF training -------------------")
        optRF = RfGridSearch.best_estimator_
        RFfeatimp = optRF.feature_importances_
        indices = np.argsort(RFfeatimp)[::-1]  # most important feature first
        print("Training R2 = %5.2f" % optRF.score(XTrain, yTrain))
        print("Starting optimised RF prediction")
        RFpreds = optRF.predict(XTest)
        print("The predicted values now follow :")
        for row in _rounded_rows(RFpreds):
            print(" \t".join(row))
        print("\n")

        # Per-fold test metrics.
        fold_r2 = optRF.score(XTest, yTest)
        fold_mse = metrics.mean_squared_error(yTest, RFpreds)
        fold_rmse = math.sqrt(fold_mse)
        RFr2.append(fold_r2)
        RFmse.append(fold_mse)
        RFrmse.append(fold_rmse)
        print("Random Forest prediction statistics for fold %d are; MSE = %5.2f RMSE = %5.2f R2 = %5.2f\n\n" % (metcount + 1, fold_mse, fold_rmse, fold_r2))
        ftrain.write("Random Forest prediction statistics for fold %d are, MSE =, %5.2f, RMSE =, %5.2f, R2 =, %5.2f,\n\n" % (metcount + 1, fold_mse, fold_rmse, fold_r2))

        ffeatimp.write("Feature importance rankings from random forest,\n")
        for rank, feat in enumerate(indices, 1):
            ffeatimp.write("%d. , feature %d , %s, (%f),\n" % (rank, feat, npheader[feat], RFfeatimp[feat]))

        # Store the predictions at the original row positions of this fold's
        # test samples (the folds are shuffled, so simple appending would
        # pair predictions with the wrong names / true values).
        RFpredictions[test] = RFpreds

        metcount += 1
        ftrain.write("Fold %d, \n" % (metcount))
        print("------------------- Next Fold %d -------------------" % (metcount + 1))

    lennames = names.shape[0]
    lenpredictions = RFpredictions.shape[0]
    lentrue = y.shape[0]
    # Every sample must have received exactly one out-of-fold prediction.
    all_predicted = not np.isnan(RFpredictions).any()
    if lennames == lenpredictions == lentrue and all_predicted:
        fpred.write("Names/Label,, Prediction Random Forest,, True Value,\n")
        for label, predicted, truth in zip(names, RFpredictions, y):
            fpred.write(str(label) + ",," + str(predicted) + ",," + str(truth) + ",\n")
    else:
        fpred.write("ERROR - names, prediction and true value array size mismatch. Dumping arrays for manual inspection in predictions.csv\n")
        fpred.write("Array printed in the order names/Labels, predictions RF and true values\n")
        # Arrays must be converted explicitly; array + str is not a string.
        fpred.write(str(names) + "\n")
        fpred.write(str(RFpredictions) + "\n")
        fpred.write(str(y) + "\n")
        sys.exit("ERROR - names, prediction and true value array size mismatch. Dumping arrays for manual inspection in predictions.csv")

    print("Final averaged Random Forest metrics : ")
    # Average over however many folds were actually run.
    RFamse = np.mean(RFmse)
    RFmse_sd = np.std(RFmse)
    RFarmse = np.mean(RFrmse)
    RFrmse_sd = np.std(RFrmse)
    RFslope, RFintercept, RFr_value, RFp_value, RFstd_err = scipy.stats.linregress(RFpredictions, y)
    RFR2 = RFr_value**2
    print("Average Mean Squared Error =  %s  +/-  %s" % (RFamse, RFmse_sd))
    print("Average Root Mean Squared Error =  %s  +/-  %s" % (RFarmse, RFrmse_sd))
    print("R2 Final prediction against True values =  %s" % RFR2)

    fpred.write("\n")
    fpred.write("FINAL PREDICTION STATISTICS,\n")
    fpred.write("Random Forest average MSE, %s, +/-, %s,\n" % (str(RFamse), str(RFmse_sd)))
    fpred.write("Random Forest average RMSE, %s, +/-, %s,\n" % (str(RFarmse), str(RFrmse_sd)))
    fpred.write("Random Forest slope, %s, Random Forest intercept, %s,\n" % (str(RFslope), str(RFintercept)))
    fpred.write("Random Forest standard error, %s,\n" % (str(RFstd_err)))
    fpred.write("Random Forest R, %s,\n" % (str(RFr_value)))
    fpred.write("Random Forest R2, %s,\n" % (str(RFR2)))

# No explicit close() calls: the with-statement has already closed all files.

데이터에 특성 선택(feature selection)을 추가할 수도 있습니다.

출처

2016-06-19 James

이것이 개선하고 싶은 **작동하는 코드**라면 [codereview.se]를 참조하십시오. 그렇지 않다면 [mcve]로 문제를 명확히 설명해 주십시오. – jonrsharpe

안녕하세요 - 전반적으로 좋은 코드입니다. 작은 팁: stdout의 print 문마다 별도의 'write'를 작성할 필요는 없습니다. 삶을 편하게 하려면 'heredoc'(삼중 따옴표로 묶은 문자열)을 찾아보세요 :) http://lofic.github.io/tips/python-heredoc.html – javadba

코드를 검토하지는 않았습니다. 일반적인 제안일 뿐이지만, RandomForest를 사용한다면 scikit-learn의 ExtraTrees도 함께 시도해 보세요. ExtraTrees는 랜덤 포레스트(Random Forest)에 무작위성을 한 단계 더 추가한 것으로, [논문](http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.65.7485&rep=rep1&type=pdf)에 따르면 일반적으로 성능이 더 좋습니다. –

scikit-learn feature selection

scikit-learn이 제공하는 특성 선택 기법들을 익혀 두면 데이터 마이닝(DM) 프로젝트의 일부 측면을 개선하는 데 사용할 수 있습니다.

출처

2016-06-19 17:05:50 Masoud

의견을 보내 주셔서 감사합니다. – James

Python 랜덤 포레스트 및 기계 학습 - 개선

답변

관련 문제