Presented by Sean Glover / @randonom
import org.apache.spark.SparkContext
import org.apache.spark.SparkContext._
import org.apache.spark.SparkConf

object SimpleApp {
  def main(args: Array[String]): Unit = {
    val logFile = "YOUR_SPARK_HOME/README.md" // Should be some file on your system
    val conf = new SparkConf().setAppName("Simple Application")
    val sc = new SparkContext(conf)
    // Cache the RDD, since it is traversed twice by the two counts below
    val logData = sc.textFile(logFile, 2).cache()
    val numAs = logData.filter(line => line.contains("a")).count()
    val numBs = logData.filter(line => line.contains("b")).count()
    println("Lines with a: %s, Lines with b: %s".format(numAs, numBs))
  }
}
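Packaged with sbt package, the app runs through spark-submit; this invocation follows the Spark quick start, and the jar path depends on your project name and Scala version:

$ YOUR_SPARK_HOME/bin/spark-submit \
    --class "SimpleApp" \
    --master local[4] \
    target/scala-2.10/simple-project_2.10-1.0.jar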
$ du stackoverflow.com*/*.xml --total --block-size=G
1G stackoverflow.com-Badges/Badges.xml
8G stackoverflow.com-Comments/Comments.xml
46G stackoverflow.com-PostHistory/PostHistory.xml
1G stackoverflow.com-PostLinks/PostLinks.xml
29G stackoverflow.com-Posts/Posts.xml
1G stackoverflow.com-Tags/Tags.xml
1G stackoverflow.com-Users/Users.xml
7G stackoverflow.com-Votes/Votes.xml
90G total
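Each dump file is one large XML document, but every record is a single <row .../> element on its own line, so Spark can split the files with plain textFile instead of a streaming XML parser. A rough sketch of reading Posts.xml this way (Id and Score are real attributes in the dump; the line-per-row layout is the key assumption):

import scala.xml.XML

val posts = sc.textFile("stackoverflow.com-Posts/Posts.xml")
  .filter(_.trim.startsWith("<row"))  // drop the XML declaration and wrapper element
  .map { line =>
    val row = XML.loadString(line.trim)
    ((row \ "@Id").text.toLong, (row \ "@Score").text)
  }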
libraryDependencies += "io.confluent" % "kafka-avro-serializer" % "1.0.1"
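The kafka-avro-serializer artifact is published to Confluent's Maven repository rather than Maven Central, so the build also needs a resolver (the resolver name here is arbitrary):

resolvers += "Confluent" at "http://packages.confluent.io/maven/"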
import java.util.Properties

import io.confluent.kafka.serializers.KafkaAvroSerializer
import org.apache.kafka.clients.producer.KafkaProducer

val props = new Properties()
props.put("bootstrap.servers", "localhost:9092")
props.put("schema.registry.url", "http://localhost:8081") // Confluent Schema Registry
props.put("value.serializer", classOf[KafkaAvroSerializer].getName)
props.put("key.serializer", classOf[KafkaAvroSerializer].getName)
val producer = new KafkaProducer[Object, Object](props)
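With the serializers wired up, publishing a message means handing the producer an Avro GenericRecord. A minimal sketch, assuming a hypothetical "posts" topic and a one-field schema (neither is from the talk):

import org.apache.avro.Schema
import org.apache.avro.generic.GenericRecordBuilder
import org.apache.kafka.clients.producer.ProducerRecord

// Hypothetical schema; in practice it would describe a Stack Overflow record
val schema = new Schema.Parser().parse(
  """{"type":"record","name":"Post","fields":[{"name":"id","type":"long"}]}""")
val post = new GenericRecordBuilder(schema).set("id", 42L).build()

// KafkaAvroSerializer registers the schema with the registry on first use
producer.send(new ProducerRecord[Object, Object]("posts", post))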