import org.apache.spark.{SparkConf, SparkContext}

def main(args: Array[String]): Unit = {
  val in = "file:///home/hadoop/data/site.log"
  // Connect to the Spark master
  val conf = new SparkConf().setAppName(this.getClass.getSimpleName).setMaster("local[2]")
  val sc = new SparkContext(conf)

  val fileRDD = sc.textFile(in)

  val mapRDD = fileRDD.map(lines => {
    val words = lines.split("\t")
    ((words(0), words(1)), 1) // ((domain, url), 1)
  })

  // Count per (domain, url), group by domain, then sort each group's list and keep the top 2
  val result = mapRDD
    .reduceByKey(_ + _)
    .groupBy(x => x._1._1)
    .mapValues(x => x.toList.sortBy(x => -x._2).map(x => (x._1._1, x._1._2, x._2)).take(2))

  result.foreach(println)

  sc.stop()
}
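The groupBy version above materializes every (url, count) pair of a domain as one in-memory list before sorting it. A minimal sketch (not from the original post; it reuses mapRDD from the block above) of the same top 2 per domain with aggregateByKey, which only ever keeps two candidates per key:

// Sketch only: assumes mapRDD of ((domain, url), 1) records as defined above.
val top2PerDomain = mapRDD
  .reduceByKey(_ + _)                                   // ((domain, url), count)
  .map { case ((domain, url), cnt) => (domain, (url, cnt)) }
  .aggregateByKey(List.empty[(String, Int)])(
    (acc, v) => (v :: acc).sortBy(-_._2).take(2),       // fold one (url, count) into the running top 2
    (a, b) => (a ++ b).sortBy(-_._2).take(2)            // merge two partial top-2 lists
  )
top2PerDomain.foreach(println)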
def main(args: Array[String]): Unit = {
  val in = "tunan-spark-core/data/site.log"
  // Connect to the Spark master
  val conf = new SparkConf().setAppName(this.getClass.getSimpleName).setMaster("local[2]")
  val sc = new SparkContext(conf)

  val fileRDD = sc.textFile(in)

  val mapRDD = fileRDD.map(lines => {
    val words = lines.split("\t")
    ((words(0), words(1)), 1) // ((domain, url), 1)
  })

  // Hard-coded list of target domains
  val domains = Array("www.google.com", "www.ruozedata.com", "www.baidu.com")

  // For each target domain: filter its records, count per URL, keep the top 2 (one Spark job per domain)
  for (domain <- domains) {
    mapRDD.filter(x => domain.equals(x._1._1))
      .reduceByKey(_ + _)
      .sortBy(x => -x._2)
      .take(2)
      .foreach(println)
  }

  sc.stop()
}
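Because the target domains are known up front here, one option (a sketch only, not from the original post; it reuses mapRDD and domains from above) is to broadcast the array and filter all three domains in a single job instead of running one job per domain:

// Sketch only: single-pass filtering against a broadcast set of target domains.
val wanted = sc.broadcast(domains.toSet)
mapRDD
  .filter { case ((domain, _), _) => wanted.value.contains(domain) }
  .reduceByKey(_ + _)
  .groupBy(_._1._1)
  .mapValues(_.toList.sortBy(-_._2).take(2).map { case ((d, u), c) => (d, u, c) })
  .foreach(println)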
def main(args: Array[String]): Unit = {
  val in = "tunan-spark-core/data/site.log"
  // Connect to the Spark master
  val conf = new SparkConf().setAppName(this.getClass.getSimpleName).setMaster("local[2]")
  val sc = new SparkContext(conf)

  val fileRDD = sc.textFile(in)

  val mapRDD = fileRDD.map(lines => {
    val words = lines.split("\t")
    ((words(0), words(1)), 1) // ((domain, url), 1)
  })

  // Collect the distinct domains back to the driver instead of hard-coding them
  val domains = mapRDD.map(x => x._1._1).distinct().collect()

  // For each domain: filter its records, count per URL, keep the top 2 (one Spark job per domain)
  for (domain <- domains) {
    mapRDD.filter(x => domain.equals(x._1._1))
      .reduceByKey(_ + _)
      .sortBy(x => -x._2)
      .take(2)
      .foreach(println)
  }

  sc.stop()
}
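Collecting the domains and then filtering once per domain still launches one Spark job per domain. A common refinement is a custom Partitioner that routes each domain to its own partition, so the per-domain top 2 can be computed in a single job. The sketch below is not from the original post; it reuses mapRDD and domains from the block above, and the name DomainPartitioner is made up for illustration.

// Sketch only: one partition per domain, then take the top 2 inside each partition.
import org.apache.spark.Partitioner

class DomainPartitioner(ds: Array[String]) extends Partitioner {
  private val index = ds.zipWithIndex.toMap
  override def numPartitions: Int = ds.length
  override def getPartition(key: Any): Int = index(key.asInstanceOf[String])
}

mapRDD
  .reduceByKey(_ + _)                                    // ((domain, url), count)
  .map { case ((domain, url), cnt) => (domain, (url, cnt)) }
  .partitionBy(new DomainPartitioner(domains))           // each domain lands in its own partition
  .mapPartitions(_.toList.sortBy(-_._2._2).take(2).iterator)
  .foreach(println)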