本文基于spark-1.6.2-bin-hadoop2.6
提交到本地
程序中指定的参数(param)和spark-submit提交时参数配置一致:
import org.apache.spark.deploy.SparkSubmit;

/**
 * Submits a Spark application to a local master programmatically.
 * The argument array mirrors exactly the flags one would pass on the
 * spark-submit command line.
 */
public class Dr {
    public static void main(String[] args) {
        // Equivalent to:
        //   spark-submit --name app_name --master local[*] --class Tkmeans /mnt/tkmeans_2.10-1.0.jar
        final String[] submitArgs = {
                "--name", "app_name",
                "--master", "local[*]",
                "--class", "Tkmeans",
                "/mnt/tkmeans_2.10-1.0.jar"
        };
        SparkSubmit.main(submitArgs);
    }
}
提交到YARN
用spark-submit提交任务到YARN集群,只需要HADOOP_CONF_DIR
环境变量指向YARN的配置文件目录就好。
用程序提交虽然也要求指定HADOOP_CONF_DIR
环境变量,但指定了却并不起作用,需要在程序里指定yarn配置:
import org.apache.hadoop.conf.Configuration;import org.apache.spark.SparkConf;import org.apache.spark.deploy.yarn.Client;import org.apache.spark.deploy.yarn.ClientArguments;public class Test { public static void main(String[] args) { String[] param = new String[] { "--name", "test java submit job to yarn", "--class", "Tkmeans", // "--executor-memory","1G", // "--arg","hdfs://node101:8020/user/root/log.txt", // "--arg","hdfs://node101:8020/user/root/badLines_yarn_", "--jar", "/mnt/tkmeans_2.10-1.0.jar" }; Configuration conf = new Configuration(); String os = System.getProperty("os.name"); boolean cross_platform = false; if (os.contains("Windows")) { cross_platform = true; } // 配置使用跨平台提交任务 conf.setBoolean("mapreduce.app-submission.cross-platform", cross_platform); conf.set("fs.defaultFS", "hdfs://data60:9000"); // 指定namenode conf.set("mapreduce.framework.name", "yarn"); // 指定使用yarn框架 conf.set("yarn.resourcemanager.address", "data60:8032"); // 指定resourcemanager conf.set("yarn.resourcemanager.scheduler.address", "data60:8030"); // 指定资源分配器 conf.set("mapreduce.jobhistory.address", "data60:10020"); System.setProperty("SPARK_YARN_MODE", "true"); // 防止每次提交任务都上传此jar,只能指定1个jar文件而不能批量指定或指定目录。 // 可以通过指定SPARK_JAR环境变量,但此种方式已经弃用。 System.setProperty("spark.yarn.jar", "hdfs:///jars/spark-assembly-1.6.2-hadoop2.6.0.jar"); SparkConf sparkConf = new SparkConf(); ClientArguments cArgs = new ClientArguments(param, sparkConf); Client client = new Client(cArgs, conf, sparkConf); client.run(); // 这种提交方式无法反馈任务状态 }}