작업 요약과 급여 결과를 비교하는 CSV 문서에 대한 예측을 준비 중입니다. 필자는 데이터 세트를 교육 및 테스트로 분할했습니다. 여기에서 기능은 내가 작업하는 대상이고 대상은 내가 예측 한 것입니다. 나는 인쇄로 이동하고 이러한 기록은 내가 다음과 같은 오류 얻을 제대로 분리되었는지 확인하는 경우 : ValueError를 : 일 정치 않은 모양ValueError : Scikit의 모양이 일치하지 않습니다 Train_Test Split을 배우십시오
을내 코드와 결과 오류 추적 :
import csv
import numpy as np
# create posting & label list
postList = []
labelList = []
filename = '\Users\yantezia.patrick\Downloads\Postings.csv'
csvFile = csv.reader(open(filename, 'r'), delimiter=",")
for row in csvFile:
postList.append(row[2])
labelList.append(row[10]) #appending specific columns to specific list #these willbe labels
# remove first row
postList = postList[1:] #clearing out the header rows
labelList = labelList[1:]
temp = np.array([float(i) for i in labelList])
med = np.median(temp)
for i, val in enumerate(labelList):
if float(val) >= med:
labelList[i] = 1
else:
labelList[i] = 0
# subset list
postList = postList[:100]
labelList = labelList[:100]
print postList[:2]
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
import pandas as pd
# create term matrix
cv = CountVectorizer(lowercase=True, stop_words='english', ngram_range=(1,3), min_df=10)
tfidf = TfidfVectorizer(lowercase=True, stop_words='english', ngram_range=(1,3), min_df=10)
tf_dm = cv.fit_transform(postList)
tfidf_dm = tfidf.fit_transform(postList)
pd.DataFrame(tfidf_dm.toarray(),index=postList,columns=tfidf.get_feature_names()).head(10)
tfidf.get_feature_names()
tm = tm.toarray()
print tf_dm
tm = cv.fit(postList)
print tm.vocabulary_
print tf_dm.shape
print tfidf_dm.shape
#add labels to word vector
from sklearn.cross_validation import train_test_split
features_train1 = train_test_split(tf_dm, labels, test_size=0.33, random_state=42)
features_test1 = train_test_split(tf_dm, labels, test_size=0.33, random_state=42)
target_train1 = train_test_split(tf_dm, labels, test_size=0.33, random_state=42)
target_test1 = train_test_split(tf_dm, labels, test_size=0.33, random_state=42)
features_train2 = train_test_split(tfidf_dm, labels, test_size=0.33, random_state=7)
features_test2 = train_test_split(tfidf_dm, labels, test_size=0.33, random_state=7)
target_train2 = train_test_split(tfidf_dm, labels, test_size=0.33, random_state=7)
target_test2 = train_test_split(tfidf_dm, labels, test_size=0.33, random_state=7)
print np.sum(target_train1)
print np.sum(target_test1)
print target_train1
print target_test1
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
<ipython-input-82-53ecd8559f48> in <module>()
----> 1 print np.sum(target_train1)
2 print np.sum(target_test1)
3 print target_train1
4 print target_test1
C:\Users\yantezia.patrick\AppData\Local\Continuum\Anaconda\lib\site-packages\numpy\core\fromnumeric.pyc in sum(a, axis, dtype, out, keepdims)
1707 except AttributeError:
1708 return _methods._sum(a, axis=axis, dtype=dtype,
-> 1709 out=out, keepdims=keepdims)
1710 # NOTE: Dropping the keepdims parameters here...
1711 return sum(axis=axis, dtype=dtype, out=out)
C:\Users\yantezia.patrick\AppData\Local\Continuum\Anaconda\lib\site-packages\numpy\core\_methods.pyc in _sum(a, axis, dtype, out, keepdims)
23 def _sum(a, axis=None, dtype=None, out=None, keepdims=False):
24 return um.add.reduce(a, axis=axis, dtype=dtype,
---> 25 out=out, keepdims=keepdims)
26
27 def _prod(a, axis=None, dtype=None, out=None, keepdims=False):
C:\Users\yantezia.patrick\AppData\Local\Continuum\Anaconda\lib\site-packages\scipy\sparse\compressed.pyc in __add__(self, other)
340 elif isspmatrix(other):
341 if (other.shape != self.shape):
--> 342 raise ValueError("inconsistent shapes")
343
344 return self._binopt(other,'_plus_')
ValueError: inconsistent shapes