0
metastore가없는 pyspark 2.0을 사용하여 일부 ORC 파일을 읽으 려합니다. 이론적으로 데이터 스키마가 ORC 파일에 포함되어 있기 때문에 그렇게 할 수 있습니다. 하지만 여기에 내가 무엇을 가지고 있습니다 :pyspark 2.0에서 metastore가없는 ORC 파일을 읽는 방법
[[email protected] ~]$/usr/local/spark-2.0.0-bin-hadoop2.6/bin/pyspark Python 2.7.11 (default, Feb 18 2016, 13:54:48) [GCC 4.4.7 20120313 (Red Hat 4.4.7-16)] on linux2 Type "help", "copyright", "credits" or "license" for more information. Setting default log level to "WARN". To adjust logging level use sc.setLogLevel(newLevel). Welcome to ____ __ /__/__ ___ _____/ /__ _\ \/ _ \/ _ `/ __/ '_/ /__/.__/\_,_/_/ /_/\_\ version 2.0.0 /_/ Using Python version 2.7.11 (default, Feb 18 2016 13:54:48) SparkSession available as 'spark'. >>> df=spark.read.orc('/my/orc/file') 16/08/21 22:29:38 WARN util.NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable 16/08/21 22:30:00 ERROR metastore.RetryingHMSHandler: AlreadyExistsException(message:Database default already exists) at org.apache.hadoop.hive.metastore.HiveMetaStore$HMSHandler.create_database(HiveMetaStore.java:891) at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method) at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62) at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43) at java.lang.reflect.Method.invoke(Method.java:498) at org.apache.hadoop.hive.metastore.RetryingHMSHandler.invoke(RetryingHMSHandler.java:107) at com.sun.proxy.$Proxy21.create_database(Unknown Source) at org.apache.hadoop.hive.metastore.HiveMetaStoreClient.createDatabase(HiveMetaStoreClient.java:644) at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method) at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62) at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43) at java.lang.reflect.Method.invoke(Method.java:498) at org.apache.hadoop.hive.metastore.RetryingMetaStoreClient.invoke(RetryingMetaStoreClient.java:156) at com.sun.proxy.$Proxy22.createDatabase(Unknown Source) at org.apache.hadoop.hive.ql.metadata.Hive.createDatabase(Hive.java:306) at org.apache.spark.sql.hive.client.HiveClientImpl$$anonfun$createDatabase$1.apply$mcV$sp(HiveClientImpl.scala:291) at org.apache.spark.sql.hive.client.HiveClientImpl$$anonfun$createDatabase$1.apply(HiveClientImpl.scala:291) at org.apache.spark.sql.hive.client.HiveClientImpl$$anonfun$createDatabase$1.apply(HiveClientImpl.scala:291) at org.apache.spark.sql.hive.client.HiveClientImpl$$anonfun$withHiveState$1.apply(HiveClientImpl.scala:262) at org.apache.spark.sql.hive.client.HiveClientImpl.liftedTree1$1(HiveClientImpl.scala:209) at org.apache.spark.sql.hive.client.HiveClientImpl.retryLocked(HiveClientImpl.scala:208) at org.apache.spark.sql.hive.client.HiveClientImpl.withHiveState(HiveClientImpl.scala:251) at org.apache.spark.sql.hive.client.HiveClientImpl.createDatabase(HiveClientImpl.scala:290) at org.apache.spark.sql.hive.HiveExternalCatalog$$anonfun$createDatabase$1.apply$mcV$sp(HiveExternalCatalog.scala:99) at org.apache.spark.sql.hive.HiveExternalCatalog$$anonfun$createDatabase$1.apply(HiveExternalCatalog.scala:99) at org.apache.spark.sql.hive.HiveExternalCatalog$$anonfun$createDatabase$1.apply(HiveExternalCatalog.scala:99) at org.apache.spark.sql.hive.HiveExternalCatalog.withClient(HiveExternalCatalog.scala:72) at org.apache.spark.sql.hive.HiveExternalCatalog.createDatabase(HiveExternalCatalog.scala:98) at org.apache.spark.sql.catalyst.catalog.SessionCatalog.createDatabase(SessionCatalog.scala:147) at org.apache.spark.sql.catalyst.catalog.SessionCatalog.(SessionCatalog.scala:89) at org.apache.spark.sql.hive.HiveSessionCatalog.(HiveSessionCatalog.scala:51) at org.apache.spark.sql.hive.HiveSessionState.catalog$lzycompute(HiveSessionState.scala:49) at org.apache.spark.sql.hive.HiveSessionState.catalog(HiveSessionState.scala:48) at org.apache.spark.sql.hive.HiveSessionState$$anon$1.(HiveSessionState.scala:63) at org.apache.spark.sql.hive.HiveSessionState.analyzer$lzycompute(HiveSessionState.scala:63) at org.apache.spark.sql.hive.HiveSessionState.analyzer(HiveSessionState.scala:62) at org.apache.spark.sql.execution.QueryExecution.assertAnalyzed(QueryExecution.scala:49) at org.apache.spark.sql.Dataset$.ofRows(Dataset.scala:64) at org.apache.spark.sql.SparkSession.baseRelationToDataFrame(SparkSession.scala:382) at org.apache.spark.sql.DataFrameReader.load(DataFrameReader.scala:143) at org.apache.spark.sql.DataFrameReader.orc(DataFrameReader.scala:450) at org.apache.spark.sql.DataFrameReader.orc(DataFrameReader.scala:439) at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method) at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62) at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43) at java.lang.reflect.Method.invoke(Method.java:498) at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:237) at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357) at py4j.Gateway.invoke(Gateway.java:280) at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:128) at py4j.commands.CallCommand.execute(CallCommand.java:79) at py4j.GatewayConnection.run(GatewayConnection.java:211) at java.lang.Thread.run(Thread.java:745) >>>
올바른 방법으로 ORC 파일을 읽을 수 있습니까?
ORC에만 어떤 문제가 있다고 생각하십니까? – zero323
거기에 오류가 없습니다. 'ERROR metastore.RetryingHMSHandler : AlreadyExistsException (메시지 : 데이터베이스 기본값이 이미 있음)' – abhiieor
@ zero323 : ORC와 관련이 없습니다. 파르 케 파일을 읽을 때도 똑같은 문제가있었습니다. 따라서 문제는 아마도 임베디드 스키마가있는 모든 파일과 연관되어 있습니다. 하지만 실제로 읽고 싶은 파일은 ORC 형식입니다. 나는 단지 마루와 다른 형식을 신경 쓰지 않는다. – wdkitmg