Spark의 임의의 포리스트에서 교차 유효성 검사를 실행하려고합니다.Spark 임의의 포리스트 교차 유효성 검사 오류
from pyspark.ml import Pipeline
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
data = nds.sc.parallelize([
LabeledPoint(0.0, [0,402,6,0]),
LabeledPoint(0.0, [3,500,3,0]),
LabeledPoint(1.0, [1,590,1,1]),
LabeledPoint(1.0, [3,328,5,0]),
LabeledPoint(1.0, [4,351,4,0]),
LabeledPoint(0.0, [2,372,2,0]),
LabeledPoint(0.0, [4,302,5,0]),
LabeledPoint(1.0, [1,387,2,0]),
LabeledPoint(1.0, [1,419,3,0]),
LabeledPoint(0.0, [1,370,5,0]),
LabeledPoint(0.0, [1,410,4,0]),
LabeledPoint(0.0, [2,509,7,1]),
LabeledPoint(0.0, [1,307,5,0]),
LabeledPoint(0.0, [0,424,4,1]),
LabeledPoint(0.0, [1,509,2,1]),
LabeledPoint(1.0, [3,361,4,0]),
])
train=data.toDF(['label','features'])
numfolds =2
rf = RandomForestClassifier(labelCol="label", featuresCol="features")
evaluator = MulticlassClassificationEvaluator()
paramGrid = ParamGridBuilder().addGrid(rf.maxDepth,
[4,8,10]).addGrid(rf.impurity, ['entropy','gini']).addGrid(rf.featureSubsetStrategy, [6,8,10]).build()
pipeline = Pipeline(stages=[rf])
crossval = CrossValidator(
estimator=pipeline,
estimatorParamMaps=paramGrid,
evaluator=evaluator,
numFolds= numfolds)
model = crossval.fit(train)
나는 paramGrid가 목록으로 내 입력을 읽는하지 않은 것 같습니다 다음과 같은 오류
Py4JJavaError Traceback (most recent call last)
<ipython-input-87-7ea70f89086a> in <module>()
66 numFolds=num)
67
---> 68 model = crossval.fit(train)
/opt/spark/current/python/pyspark/ml/pipeline.py in fit(self, dataset, params)
67 return self.copy(params)._fit(dataset)
68 else:
---> 69 return self._fit(dataset)
70 else:
71 raise ValueError("Params must be either a param map or a list/tuple of param maps, "
/opt/spark/current/python/pyspark/ml/tuning.py in _fit(self, dataset)
237 train = df.filter(~condition)
238 for j in range(numModels):
--> 239 model = est.fit(train, epm[j])
240 # TODO: duplicate evaluator to take extra params from input
241 metric = eva.evaluate(model.transform(validation, epm[j]))
/opt/spark/current/python/pyspark/ml/pipeline.py in fit(self, dataset, params)
65 elif isinstance(params, dict):
66 if params:
---> 67 return self.copy(params)._fit(dataset)
68 else:
69 return self._fit(dataset)
/opt/spark/current/python/pyspark/ml/pipeline.py in _fit(self, dataset)
211 dataset = stage.transform(dataset)
212 else: # must be an Estimator
--> 213 model = stage.fit(dataset)
214 transformers.append(model)
215 if i < indexOfLastEstimator:
/opt/spark/current/python/pyspark/ml/pipeline.py in fit(self, dataset, params)
67 return self.copy(params)._fit(dataset)
68 else:
---> 69 return self._fit(dataset)
70 else:
71 raise ValueError("Params must be either a param map or a list/tuple of param maps, "
/opt/spark/current/python/pyspark/ml/wrapper.py in _fit(self, dataset)
130
131 def _fit(self, dataset):
--> 132 java_model = self._fit_java(dataset)
133 return self._create_model(java_model)
134
/opt/spark/current/python/pyspark/ml/wrapper.py in _fit_java(self, dataset)
126 :return: fitted Java model
127 """
--> 128 self._transfer_params_to_java()
129 return self._java_obj.fit(dataset._jdf)
130
/opt/spark/current/python/pyspark/ml/wrapper.py in _transfer_params_to_java(self)
80 for param in self.params:
81 if param in paramMap:
---> 82 pair = self._make_java_param_pair(param, paramMap[param])
83 self._java_obj.set(pair)
84
/opt/spark/current/python/pyspark/ml/wrapper.py in _make_java_param_pair(self, param, value)
71 java_param = self._java_obj.getParam(param.name)
72 java_value = _py2java(sc, value)
---> 73 return java_param.w(java_value)
74
75 def _transfer_params_to_java(self):
/opt/spark/current/python/lib/py4j-0.9-src.zip/py4j/java_gateway.py in __call__(self, *args)
811 answer = self.gateway_client.send_command(command)
812 return_value = get_return_value(
--> 813 answer, self.gateway_client, self.target_id, self.name)
814
815 for temp_arg in temp_args:
/opt/spark/current/python/pyspark/sql/utils.py in deco(*a, **kw)
43 def deco(*a, **kw):
44 try:
---> 45 return f(*a, **kw)
46 except py4j.protocol.Py4JJavaError as e:
47 s = e.java_exception.toString()
/opt/spark/current/python/lib/py4j-0.9-src.zip/py4j/protocol.py in get_return_value(answer, gateway_client, target_id, name)
306 raise Py4JJavaError(
307 "An error occurred while calling {0}{1}{2}.\n".
--> 308 format(target_id, ".", name), value)
309 else:
310 raise Py4JError(
Py4JJavaError: An error occurred while calling o1434.w.
: java.lang.ClassCastException: java.lang.Integer cannot be cast to java.lang.String
at org.apache.spark.ml.tree.RandomForestParams$$anonfun$5.apply(treeParams.scala:340)
at org.apache.spark.ml.param.Param.validate(params.scala:71)
at org.apache.spark.ml.param.ParamPair.<init>(params.scala:509)
at org.apache.spark.ml.param.Param.$minus$greater(params.scala:85)
at org.apache.spark.ml.param.Param.w(params.scala:82)
at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.lang.reflect.Method.invoke(Method.java:497)
at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:231)
at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:381)
at py4j.Gateway.invoke(Gateway.java:259)
at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:133)
at py4j.commands.CallCommand.execute(CallCommand.java:79)
at py4j.GatewayConnection.run(GatewayConnection.java:209)
at java.lang.Thread.run(Thread.java:745)
를 얻을. 대체 형식이나 해결 방법이 있습니까? 어떤 도움을 주시면 감사하겠습니다.
rf.featureSubsetStrategy, [ 'auto', 'onethird']로 바꾸었지만 동일한 오류가 발생했습니다. 그런 다음 ParamGridBuilder에서 rf.featureSubsetStrategy, [ 'auto', 'onethird']를 제거하고 동일한 오류가 다시 발생했습니다. – mikeL
@mikeL 'ParamGrid'와 관련없는 코드에 다른 문제가 있지만이 특정 문제를 해결하면 동일한 오류가 발생하지 않을 것입니다. – zero323
예, 다른 오류입니다. 그렇습니다. 다른 문제가 있습니다. 데이터 프레임의 기능과 레이블이 잘못된 위치에있는 것 같습니다. – mikeL