Collaborative Filtering: Spark ALS with a Sparse Implicit Dataset
I am trying to run the MovieLensALS example in Spark on an implicit dataset, and I am receiving this error:
    Got 3856988 ratings from 144250 users on 378937 movies.
    Training: 3085522, test: 771466.
    15/07/13 10:43:07 WARN BLAS: Failed to load implementation from: com.github.fommil.netlib.NativeSystemBLAS
    15/07/13 10:43:07 WARN BLAS: Failed to load implementation from: com.github.fommil.netlib.NativeRefBLAS
    15/07/13 10:43:10 WARN TaskSetManager: Lost task 3.0 in stage 29.0 (TID 192, 10.162.45.33): java.lang.AssertionError: assertion failed: lapack.dppsv returned 1.
        at scala.Predef$.assert(Predef.scala:179)
        at org.apache.spark.ml.recommendation.ALS$CholeskySolver.solve(ALS.scala:386)
        at org.apache.spark.ml.recommendation.ALS$$anonfun$org$apache$spark$ml$recommendation$ALS$$computeFactors$1.apply(ALS.scala:1163)
        at org.apache.spark.ml.recommendation.ALS$$anonfun$org$apache$spark$ml$recommendation$ALS$$computeFactors$1.apply(ALS.scala:1124)
        at org.apache.spark.rdd.PairRDDFunctions$$anonfun$mapValues$1$$anonfun$apply$41$$anonfun$apply$42.apply(PairRDDFunctions.scala:700)
        at org.apache.spark.rdd.PairRDDFunctions$$anonfun$mapValues$1$$anonfun$apply$41$$anonfun$apply$42.apply(PairRDDFunctions.scala:700)
        at scala.collection.Iterator$$anon$11.next(Iterator.scala:328)
        at org.apache.spark.storage.MemoryStore.unrollSafely(MemoryStore.scala:277)
        at org.apache.spark.CacheManager.putInBlockManager(CacheManager.scala:171)
        at org.apache.spark.CacheManager.getOrCompute(CacheManager.scala:78)
        at org.apache.spark.rdd.RDD.iterator(RDD.scala:242)
        at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:35)
        at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:277)
        at org.apache.spark.rdd.RDD.iterator(RDD.scala:244)
        at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:63)
        at org.apache.spark.scheduler.Task.run(Task.scala:70)
        at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:213)
        at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1145)
        at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:615)
        at java.lang.Thread.run(Thread.java:745)
    15/07/13 10:43:10 ERROR TaskSetManager: Task 12 in stage 29.0 failed 4 times; aborting job
    Exception in thread "main" org.apache.spark.SparkException: Job aborted due to stage failure: Task 12 in stage 29.0 failed 4 times, most recent failure: Lost task 12.3 in stage 29.0 (TID 249, 10.162.45.33): java.lang.AssertionError: assertion failed: lapack.dppsv returned 1.
        at scala.Predef$.assert(Predef.scala:179)
        at org.apache.spark.ml.recommendation.ALS$CholeskySolver.solve(ALS.scala:386)
        at org.apache.spark.ml.recommendation.ALS$$anonfun$org$apache$spark$ml$recommendation$ALS$$computeFactors$1.apply(ALS.scala:1163)
        at org.apache.spark.ml.recommendation.ALS$$anonfun$org$apache$spark$ml$recommendation$ALS$$computeFactors$1.apply(ALS.scala:1124)
        at org.apache.spark.rdd.PairRDDFunctions$$anonfun$mapValues$1$$anonfun$apply$41$$anonfun$apply$42.apply(PairRDDFunctions.scala:700)
        at org.apache.spark.rdd.PairRDDFunctions$$anonfun$mapValues$1$$anonfun$apply$41$$anonfun$apply$42.apply(PairRDDFunctions.scala:700)
        at scala.collection.Iterator$$anon$11.next(Iterator.scala:328)
        at org.apache.spark.storage.MemoryStore.unrollSafely(MemoryStore.scala:277)
        at org.apache.spark.CacheManager.putInBlockManager(CacheManager.scala:171)
        at org.apache.spark.CacheManager.getOrCompute(CacheManager.scala:78)
        at org.apache.spark.rdd.RDD.iterator(RDD.scala:242)
        at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:35)
        at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:277)
        at org.apache.spark.rdd.RDD.iterator(RDD.scala:244)
        at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:63)
        at org.apache.spark.scheduler.Task.run(Task.scala:70)
        at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:213)
        at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1145)
        at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:615)
        at java.lang.Thread.run(Thread.java:745)
    Driver stacktrace:
        at org.apache.spark.scheduler.DAGScheduler.org$apache$spark$scheduler$DAGScheduler$$failJobAndIndependentStages(DAGScheduler.scala:1266)
        at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1257)
        at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1256)
        at scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59)
        at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:47)
        at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:1256)
        at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:730)
        at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:730)
        at scala.Option.foreach(Option.scala:236)
        at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:730)
        at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1450)
        at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1411)
        at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:48)
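For context, the run boils down to something like the following (a minimal sketch, not my exact code: the file path, rank, iteration count, lambda, and alpha are placeholders, and I am assuming the spark.mllib ALS.trainImplicit entry point, which delegates to the ml.recommendation.ALS solver shown in the trace):

    import org.apache.spark.{SparkConf, SparkContext}
    import org.apache.spark.mllib.recommendation.{ALS, Rating}

    object ImplicitALSRun {
      def main(args: Array[String]): Unit = {
        val sc = new SparkContext(new SparkConf().setAppName("ImplicitALSRun"))

        // MovieLens format: userId::movieId::rating::timestamp; in the implicit
        // case the rating column is treated as a confidence signal, not a score
        val ratings = sc.textFile("ratings.dat").map { line =>
          val fields = line.split("::")
          Rating(fields(0).toInt, fields(1).toInt, fields(2).toDouble)
        }.cache()

        // 80/20 split, matching the "Training: 3085522, test: 771466" counts above
        val Array(training, test) = ratings.randomSplit(Array(0.8, 0.2))

        // trainImplicit delegates to ml.recommendation.ALS, whose CholeskySolver
        // is where the "lapack.dppsv returned 1" assertion fires
        val model = ALS.trainImplicit(training, 12, 20, 0.01, 1.0)
      }
    }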
I am unsure whether this is due to the sparsity of the dataset. It works fine when trained explicitly. However, since the dataset is binary, the error rate obtained from the explicit ALS model is not accurate.
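To make that last point concrete, here is roughly how the models get scored (a sketch, not my exact code; the clamping of implicit predictions into [0, 1] mirrors what the bundled example does for implicit preferences, and the helper name is my own). With binary 0/1 labels, an explicit-model RMSE is not comparing like with like: explicit ALS reconstructs the rating values, while implicit ALS predicts a preference/confidence score on a different scale.

    import org.apache.spark.mllib.recommendation.{MatrixFactorizationModel, Rating}
    import org.apache.spark.rdd.RDD

    // Root-mean-square error of a trained model against held-out ratings
    def rmse(model: MatrixFactorizationModel,
             data: RDD[Rating],
             implicitPrefs: Boolean): Double = {
      // Predict for every (user, product) pair in the test set
      val predictions = model.predict(data.map(r => (r.user, r.product)))
      // Pair each prediction with the actual label by (user, product) key
      val joined = predictions
        .map(p => ((p.user, p.product), p.rating))
        .join(data.map(r => ((r.user, r.product), r.rating)))
        .values
      val squaredErrors = joined.map { case (pred, actual) =>
        // Implicit predictions approximate a preference in [0, 1],
        // so clamp them before comparing against a binary label
        val p = if (implicitPrefs) math.max(math.min(pred, 1.0), 0.0) else pred
        val err = p - actual
        err * err
      }
      math.sqrt(squaredErrors.mean())
    }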
Would it be possible for you to help me out? Thank you, Ben