While running Hive on Spark, the following error appeared:

Caused by: org.apache.hadoop.hive.ql.metadata.HiveException: java.lang.NullPointerException
at org.apache.hadoop.hive.ql.exec.vector.mapjoin.VectorMapJoinInnerBigOnlyLongOperator.process(VectorMapJoinInnerBigOnlyLongOperator.java:405) ~[hive-exec-3.1.2.jar:3.1.2]
at org.apache.hadoop.hive.ql.exec.Operator.vectorForward(Operator.java:966) ~[hive-exec-3.1.2.jar:3.1.2]
at org.apache.hadoop.hive.ql.exec.Operator.forward(Operator.java:939) ~[hive-exec-3.1.2.jar:3.1.2]
at org.apache.hadoop.hive.ql.exec.vector.VectorSelectOperator.process(VectorSelectOperator.java:158) ~[hive-exec-3.1.2.jar:3.1.2]
at org.apache.hadoop.hive.ql.exec.Operator.vectorForward(Operator.java:966) ~[hive-exec-3.1.2.jar:3.1.2]
at org.apache.hadoop.hive.ql.exec.Operator.forward(Operator.java:939) ~[hive-exec-3.1.2.jar:3.1.2]
at org.apache.hadoop.hive.ql.exec.vector.VectorFilterOperator.process(VectorFilterOperator.java:136) ~[hive-exec-3.1.2.jar:3.1.2]
at org.apache.hadoop.hive.ql.exec.Operator.vectorForward(Operator.java:966) ~[hive-exec-3.1.2.jar:3.1.2]
at org.apache.hadoop.hive.ql.exec.Operator.forward(Operator.java:939) ~[hive-exec-3.1.2.jar:3.1.2]
at org.apache.hadoop.hive.ql.exec.TableScanOperator.process(TableScanOperator.java:125) ~[hive-exec-3.1.2.jar:3.1.2]
at org.apache.hadoop.hive.ql.exec.vector.VectorMapOperator.deliverVectorizedRowBatch(VectorMapOperator.java:812) ~[hive-exec-3.1.2.jar:3.1.2]
at org.apache.hadoop.hive.ql.exec.vector.VectorMapOperator.process(VectorMapOperator.java:845) ~[hive-exec-3.1.2.jar:3.1.2]
at org.apache.hadoop.hive.ql.exec.spark.SparkMapRecordHandler.processRow(SparkMapRecordHandler.java:136) ~[hive-exec-3.1.2.jar:3.1.2]
at org.apache.hadoop.hive.ql.exec.spark.HiveMapFunctionResultList.processNextRecord(HiveMapFunctionResultList.java:48) ~[hive-exec-3.1.2.jar:3.1.2]
at org.apache.hadoop.hive.ql.exec.spark.HiveMapFunctionResultList.processNextRecord(HiveMapFunctionResultList.java:27) ~[hive-exec-3.1.2.jar:3.1.2]
at org.apache.hadoop.hive.ql.exec.spark.HiveBaseFunctionResultList.hasNext(HiveBaseFunctionResultList.java:85) ~[hive-exec-3.1.2.jar:3.1.2]
at scala.collection.convert.Wrappers$JIteratorWrapper.hasNext(Wrappers.scala:45) ~[scala-library-2.12.15.jar:?]
at org.apache.spark.shuffle.sort.UnsafeShuffleWriter.write(UnsafeShuffleWriter.java:179) ~[spark-core_2.12-3.2.1.jar:3.2.1]
at org.apache.spark.shuffle.ShuffleWriteProcessor.write(ShuffleWriteProcessor.scala:59) ~[spark-core_2.12-3.2.1.jar:3.2.1]
at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:99) ~[spark-core_2.12-3.2.1.jar:3.2.1]
at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:52) ~[spark-core_2.12-3.2.1.jar:3.2.1]
at org.apache.spark.scheduler.Task.run(Task.scala:131) ~[spark-core_2.12-3.2.1.jar:3.2.1]
at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$3(Executor.scala:506) ~[spark-core_2.12-3.2.1.jar:3.2.1]
at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1462) ~[spark-core_2.12-3.2.1.jar:3.2.1]
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:509) ~[spark-core_2.12-3.2.1.jar:3.2.1]
... 3 more

The NullPointerException above is only the surface symptom: as the message below shows, map-operator initialization had already failed, so the vectorized map-join operator was presumably left without a usable hash table. The core stack behind it is:

java.lang.RuntimeException: Map operator initialization failed: java.lang.ClassCastException: org.apache.hadoop.hive.ql.exec.persistence.HashMapWrapper cannot be cast to org.apache.hadoop.hive.ql.exec.persistence.MapJoinTableContainerDirectAccess
at org.apache.hadoop.hive.ql.exec.spark.SparkMapRecordHandler.init(SparkMapRecordHandler.java:118)
at org.apache.hadoop.hive.ql.exec.spark.HiveMapFunction.call(HiveMapFunction.java:55)
at org.apache.hadoop.hive.ql.exec.spark.HiveMapFunction.call(HiveMapFunction.java:30)
at org.apache.spark.api.java.JavaRDDLike$$anonfun$fn$7$1.apply(JavaRDDLike.scala:186)
at org.apache.spark.api.java.JavaRDDLike$$anonfun$fn$7$1.apply(JavaRDDLike.scala:186)
at org.apache.spark.rdd.RDD$$anonfun$mapPartitions$1$$anonfun$apply$23.apply(RDD.scala:801)
at org.apache.spark.rdd.RDD$$anonfun$mapPartitions$1$$anonfun$apply$23.apply(RDD.scala:801)
at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:324)
at org.apache.spark.rdd.RDD.iterator(RDD.scala:288)
at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:99)
at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:55)
at org.apache.spark.scheduler.Task.run(Task.scala:121)
at org.apache.spark.executor.Executor$TaskRunner$$anonfun$10.apply(Executor.scala:408)
at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1360)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:414)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
at java.lang.Thread.run(Thread.java:748)
Caused by: java.lang.ClassCastException: org.apache.hadoop.hive.ql.exec.persistence.HashMapWrapper cannot be cast to org.apache.hadoop.hive.ql.exec.persistence.MapJoinTableContainerDirectAccess
at org.apache.hadoop.hive.ql.exec.vector.mapjoin.optimized.VectorMapJoinOptimizedHashTable.&lt;init&gt;(VectorMapJoinOptimizedHashTable.java:92)
at org.apache.hadoop.hive.ql.exec.vector.mapjoin.optimized.VectorMapJoinOptimizedHashMultiSet.&lt;init&gt;(VectorMapJoinOptimizedHashMultiSet.java:101)
at org.apache.hadoop.hive.ql.exec.vector.mapjoin.optimized.VectorMapJoinOptimizedStringHashMultiSet.&lt;init&gt;(VectorMapJoinOptimizedStringHashMultiSet.java:61)
at org.apache.hadoop.hive.ql.exec.vector.mapjoin.optimized.VectorMapJoinOptimizedCreateHashTable.createHashTable(VectorMapJoinOptimizedCreateHashTable.java:85)
at org.apache.hadoop.hive.ql.exec.vector.mapjoin.VectorMapJoinCommonOperator.setUpHashTable(VectorMapJoinCommonOperator.java:483)
at org.apache.hadoop.hive.ql.exec.vector.mapjoin.VectorMapJoinCommonOperator.completeInitializationOp(VectorMapJoinCommonOperator.java:461)
at org.apache.hadoop.hive.ql.exec.Operator.completeInitialization(Operator.java:470)
at org.apache.hadoop.hive.ql.exec.Operator.initialize(Operator.java:400)
at org.apache.hadoop.hive.ql.exec.Operator.initialize(Operator.java:573)
at org.apache.hadoop.hive.ql.exec.Operator.initializeChildren(Operator.java:525)
at org.apache.hadoop.hive.ql.exec.Operator.initialize(Operator.java:386)
at org.apache.hadoop.hive.ql.exec.spark.SparkMapRecordHandler.init(SparkMapRecordHandler.java:109)
... 18 more
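
The shape of this failure is easy to reproduce in isolation: HashMapWrapper implements the base MapJoinTableContainer interface but not MapJoinTableContainerDirectAccess, and a cross-interface cast like that compiles fine but explodes at runtime. A self-contained sketch with toy stand-in types (not the real Hive classes):

// Toy stand-in interfaces; the real Hive types live in
// org.apache.hadoop.hive.ql.exec.persistence.
interface MapJoinTableContainer { }
interface MapJoinTableContainerDirectAccess { }

// Like Hive's HashMapWrapper, this class implements only the base interface.
class HashMapWrapper implements MapJoinTableContainer { }

public class CastDemo {
    public static void main(String[] args) {
        MapJoinTableContainer container = new HashMapWrapper();
        // Compiles (some other object could implement both interfaces), but
        // throws java.lang.ClassCastException at runtime for this instance,
        // which is exactly the failure in the stack above.
        MapJoinTableContainerDirectAccess direct =
                (MapJoinTableContainerDirectAccess) container;
        System.out.println(direct);
    }
}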

Chasing the code, it lives in src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/VectorMapJoinCommonOperator.java:

  private void setUpHashTable() {

    HashTableImplementationType hashTableImplementationType = vectorDesc.getHashTableImplementationType();
    switch (vectorDesc.getHashTableImplementationType()) {
    case OPTIMIZED:
      {
        // Create our vector map join optimized hash table variation *above* the
        // map join table container.
        vectorMapJoinHashTable = VectorMapJoinOptimizedCreateHashTable.createHashTable(conf,
            mapJoinTables[posSingleVectorMapJoinSmallTable]);
      }
      break;

    case FAST:
      {
        // Get our vector map join fast hash table variation from the
        // vector map join table container.
        VectorMapJoinTableContainer vectorMapJoinTableContainer =
            (VectorMapJoinTableContainer) mapJoinTables[posSingleVectorMapJoinSmallTable];
        vectorMapJoinHashTable = vectorMapJoinTableContainer.vectorMapJoinHashTable();
      }
      break;
    default:
      throw new RuntimeException("Unknown vector map join hash table implementation type " + hashTableImplementationType.name());
    }
    LOG.info("Using " + vectorMapJoinHashTable.getClass().getSimpleName() + " from " + this.getClass().getSimpleName());
  }

That is, the root-cause stack lands in the OPTIMIZED branch (note VectorMapJoinOptimizedCreateHashTable.createHashTable in the trace): that path casts the small-table container to MapJoinTableContainerDirectAccess, but the container that actually arrives here is a HashMapWrapper, which does not implement that interface, hence the ClassCastException. After chasing the related parameters a bit, setting:

set hive.mapjoin.optimized.hashtable=false;

keeps the plan out of this branch; in other words, disabling the optimized map-join hash table is enough.
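
If you want the override scoped to a single session programmatically, the same property can be set over the Hive JDBC driver right before the query. A minimal sketch, assuming the Hive JDBC driver is on the classpath and HiveServer2 listens on the default port; the URL, credentials, and query are placeholders, not taken from this post:

import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.Statement;

public class MapJoinHashTableWorkaround {
    public static void main(String[] args) throws Exception {
        try (Connection conn = DriverManager.getConnection(
                 "jdbc:hive2://localhost:10000/default", "hive", "");
             Statement stmt = conn.createStatement()) {
            // Session-scoped override: keep the plan out of the OPTIMIZED
            // native vectorized map-join path that performs the failing cast.
            stmt.execute("set hive.mapjoin.optimized.hashtable=false");
            // Placeholder: run the map-join query that previously failed, e.g.
            // stmt.executeQuery("<your join query>");
        }
    }
}

Putting the property in hive-site.xml instead would turn the optimized container off for every query; since that container exists mainly as a memory-efficient map-join table, a session-level override keeps the blast radius smaller.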
