2016-06-26

Spark random forest cross-validation error

I am trying to run cross-validation with a random forest in Spark.

from pyspark.ml import Pipeline 
from pyspark.ml.classification import RandomForestClassifier 
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder 
from pyspark.ml.evaluation import MulticlassClassificationEvaluator 
from pyspark.mllib.regression import LabeledPoint 

data = nds.sc.parallelize([ 
LabeledPoint(0.0, [0,402,6,0]), 
LabeledPoint(0.0, [3,500,3,0]), 
LabeledPoint(1.0, [1,590,1,1]), 
LabeledPoint(1.0, [3,328,5,0]), 
LabeledPoint(1.0, [4,351,4,0]), 
LabeledPoint(0.0, [2,372,2,0]), 
LabeledPoint(0.0, [4,302,5,0]), 
LabeledPoint(1.0, [1,387,2,0]), 
LabeledPoint(1.0, [1,419,3,0]), 
LabeledPoint(0.0, [1,370,5,0]), 
LabeledPoint(0.0, [1,410,4,0]), 
LabeledPoint(0.0, [2,509,7,1]), 
LabeledPoint(0.0, [1,307,5,0]), 
LabeledPoint(0.0, [0,424,4,1]), 
LabeledPoint(0.0, [1,509,2,1]), 
LabeledPoint(1.0, [3,361,4,0]), 
]) 


train = data.toDF(['label', 'features']) 

numfolds = 2 

rf = RandomForestClassifier(labelCol="label", featuresCol="features") 
evaluator = MulticlassClassificationEvaluator() 


paramGrid = ParamGridBuilder() \
    .addGrid(rf.maxDepth, [4, 8, 10]) \
    .addGrid(rf.impurity, ['entropy', 'gini']) \
    .addGrid(rf.featureSubsetStrategy, [6, 8, 10]) \
    .build() 

pipeline = Pipeline(stages=[rf]) 

crossval = CrossValidator(
    estimator=pipeline, 
    estimatorParamMaps=paramGrid, 
    evaluator=evaluator, 
    numFolds= numfolds) 

model = crossval.fit(train) 

I get the following error; it seems that paramGrid is not reading my inputs as a list:

Py4JJavaError        Traceback (most recent call last) 
<ipython-input-87-7ea70f89086a> in <module>() 
66  numFolds=num) 
67 
---> 68 model = crossval.fit(train) 

/opt/spark/current/python/pyspark/ml/pipeline.py in fit(self, dataset, params) 
67     return self.copy(params)._fit(dataset) 
68    else: 
---> 69     return self._fit(dataset) 
70   else: 
71    raise ValueError("Params must be either a param map or a list/tuple of param maps, " 

/opt/spark/current/python/pyspark/ml/tuning.py in _fit(self, dataset) 
237    train = df.filter(~condition) 
238    for j in range(numModels): 
--> 239     model = est.fit(train, epm[j]) 
240     # TODO: duplicate evaluator to take extra params from input 
241     metric = eva.evaluate(model.transform(validation, epm[j])) 

/opt/spark/current/python/pyspark/ml/pipeline.py in fit(self, dataset, params) 
65   elif isinstance(params, dict): 
66    if params: 
---> 67     return self.copy(params)._fit(dataset) 
68    else: 
69     return self._fit(dataset) 

/opt/spark/current/python/pyspark/ml/pipeline.py in _fit(self, dataset) 
211      dataset = stage.transform(dataset) 
212     else: # must be an Estimator 
--> 213      model = stage.fit(dataset) 
214      transformers.append(model) 
215      if i < indexOfLastEstimator: 

/opt/spark/current/python/pyspark/ml/pipeline.py in fit(self, dataset, params) 
67     return self.copy(params)._fit(dataset) 
68    else: 
---> 69     return self._fit(dataset) 
70   else: 
71    raise ValueError("Params must be either a param map or a list/tuple of param maps, " 

/opt/spark/current/python/pyspark/ml/wrapper.py in _fit(self, dataset) 
130 
131  def _fit(self, dataset): 
--> 132   java_model = self._fit_java(dataset) 
133   return self._create_model(java_model) 
134 

/opt/spark/current/python/pyspark/ml/wrapper.py in _fit_java(self, dataset) 
126   :return: fitted Java model 
127   """ 
--> 128   self._transfer_params_to_java() 
129   return self._java_obj.fit(dataset._jdf) 
130 

/opt/spark/current/python/pyspark/ml/wrapper.py in _transfer_params_to_java(self) 
80   for param in self.params: 
81    if param in paramMap: 
---> 82     pair = self._make_java_param_pair(param, paramMap[param]) 
83     self._java_obj.set(pair) 
84 

/opt/spark/current/python/pyspark/ml/wrapper.py in _make_java_param_pair(self, param, value) 
71   java_param = self._java_obj.getParam(param.name) 
72   java_value = _py2java(sc, value) 
---> 73   return java_param.w(java_value) 
74 
75  def _transfer_params_to_java(self): 

/opt/spark/current/python/lib/py4j-0.9-src.zip/py4j/java_gateway.py in __call__(self, *args) 
811   answer = self.gateway_client.send_command(command) 
812   return_value = get_return_value(
--> 813    answer, self.gateway_client, self.target_id, self.name) 
814 
815   for temp_arg in temp_args: 

/opt/spark/current/python/pyspark/sql/utils.py in deco(*a, **kw) 
43  def deco(*a, **kw): 
44   try: 
---> 45    return f(*a, **kw) 
46   except py4j.protocol.Py4JJavaError as e: 
47    s = e.java_exception.toString() 

/opt/spark/current/python/lib/py4j-0.9-src.zip/py4j/protocol.py in get_return_value(answer, gateway_client, target_id, name) 
306     raise Py4JJavaError(
307      "An error occurred while calling {0}{1}{2}.\n". 
--> 308      format(target_id, ".", name), value) 
309    else: 
310     raise Py4JError(

Py4JJavaError: An error occurred while calling o1434.w. 
: java.lang.ClassCastException: java.lang.Integer cannot be cast to  java.lang.String 
at org.apache.spark.ml.tree.RandomForestParams$$anonfun$5.apply(treeParams.scala:340) 
at org.apache.spark.ml.param.Param.validate(params.scala:71) 
at org.apache.spark.ml.param.ParamPair.<init>(params.scala:509) 
at org.apache.spark.ml.param.Param.$minus$greater(params.scala:85) 
at org.apache.spark.ml.param.Param.w(params.scala:82) 
at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method) 
at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62) 
at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43) 
at java.lang.reflect.Method.invoke(Method.java:497) 
at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:231) 
at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:381) 
at py4j.Gateway.invoke(Gateway.java:259) 
at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:133) 
at py4j.commands.CallCommand.execute(CallCommand.java:79) 
at py4j.GatewayConnection.run(GatewayConnection.java:209) 
at java.lang.Thread.run(Thread.java:745) 

Is there an alternative format or a workaround? Any help would be appreciated.

Answer


You are passing an invalid value to rf.featureSubsetStrategy. It has to be a string describing the strategy and supports the values auto, all, onethird, sqrt, and log2. See: RandomForestClassifier.featureSubsetStrategy.doc.
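For example, a version of the grid from the question with valid string values (a minimal sketch; the particular strategies chosen here are just illustrative) would be:

paramGrid = ParamGridBuilder() \
    .addGrid(rf.maxDepth, [4, 8, 10]) \
    .addGrid(rf.impurity, ['entropy', 'gini']) \
    .addGrid(rf.featureSubsetStrategy, ['auto', 'sqrt', 'log2']) \
    .build() 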

Also, you shouldn't use data.toDF(['label','features']); it doesn't keep the right column order. Use:

data.toDF() 

or, if you want to fix the names:

from operator import attrgetter 

data.map(attrgetter("label", "features")).toDF(["some_name", "some_other_name"]) 
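Here map(attrgetter("label", "features")) pulls the fields out of each LabeledPoint in an explicit order, so the column names passed to toDF are guaranteed to line up with the values.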

Finally, you have to either index the label column or provide the required metadata. See: How can I declare a Column as a categorical feature in a DataFrame for use in ml
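A minimal sketch of the indexing approach, assuming the DataFrame built above (the indexedLabel column name is just illustrative):

from pyspark.ml.feature import StringIndexer 

# Index the raw label column; fitting the indexer also attaches 
# the categorical metadata the classifier expects. 
indexer = StringIndexer(inputCol="label", outputCol="indexedLabel") 
rf = RandomForestClassifier(labelCol="indexedLabel", featuresCol="features") 
pipeline = Pipeline(stages=[indexer, rf]) 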


I replaced it with rf.featureSubsetStrategy, ['auto', 'onethird'], but got the same error. I then removed rf.featureSubsetStrategy, ['auto', 'onethird'] from the ParamGridBuilder entirely and got the same error again. – mikeL


@mikeL There are other problems in your code unrelated to the ParamGrid, but once you fix this particular issue you won't get the same error. – zero323


Yes, it's a different error. And yes, there are other problems; it looks like the features and labels ended up in the wrong places in the DataFrame. – mikeL
