目录
-
- 一、Json文件日志
-
- 1.1 清洗数据
- 1.2 rdd转dataFrame格式
- 1.3 提取'cm'中信息
- 1.4 提取'event'中信息
- 1.5 按'en'中的类型分类生成表
- 1.6 将生成的表保存到hive库
- 1.7 查看hive中的表
- 二、传统格式日志
-
- 2.1 数据清洗
- 2.2 拆分url
- 2.3 数据保存到MySQL库
- 2.4 数据分析
点击下载样例文件
提取码: vjex
一、Json文件日志
使用数据 op.log文件
注:spark-shell界面操作
1.1 清洗数据
数据样例展示:cm ap et id
"cm":
{"ln":"-55.0",
"sv":"V2.9.6",
"os":"8.0.4",
"g":"C6816QZ0@gmail.com",
"mid":"489",
"nw":"3G",
"l":"es",
"vc":"4",
"hw":"640*960",
"ar":"MX",
"uid":"489",
"t":"1593123253541",
"la":"5.2",
"md":"sumsung-18",
"vn":"1.3.4",
"ba":"Sumsung",
"sr":"I"},
"ap":"app",
"et":[
{"ett":"1593050051366","en":"loading","kv":{"extend2":"","loading_time":"14","action":"3","extend1":"","type":"2","type1":"201","loading_way":"1"}},
{"ett":"1593108791764","en":"ad","kv":{"activityId":"1","displayMills":"78522","entry":"1","action":"1","contentType":"0"}},{"ett":"1593111271266","en":"notification","kv":{"ap_time":"1593097087883","action":"1","type":"1","content":""}},{"ett":"1593066033562","en":"active_background","kv":{"active_source":"3"}},
{"ett":"1593135644347","en":"comment","kv":{"p_comment_id":1,"addtime":"1593097573725","praise_count":973,"other_id":5,"comment_id":9,"reply_count":40,"userid":7,"content":"辑赤蹲慰鸽抿肘捎"}}],
"id":"1593136280858"
//日志上传hdfs
hdfs dfs -put /opt/op.log /logFile/
//开启spark
//读取日志文件
// Read the raw log file from HDFS, one record per line.
val lines = sc.textFile("hdfs://hadoop001:9000/logFile/op.log")

// Split the id off from the rest of the line: the part before the first '|'
// is kept as the id, the part after it as the JSON payload.
val rdd = lines.map(_.split('|')).map(x => (x(0), x(1)))

// Fold the id back into the JSON text: drop the payload's last character
// (presumably the closing '}' — verify against the data) and append
// `,"id":"<id>"}` so the id becomes a regular field of the object.
val jsonRdd = rdd.map { case (id, payload) =>
  val withoutClosingBrace = payload.substring(0, payload.length - 1)
  withoutClosingBrace + ",\"id\":\"" + id + "\"}"
}
1.2 rdd转dataFrame格式
//导包
import org.apache.spark.sql.types._
import org.apache.spark.sql.functions._//将rdd转dataFrame
// Convert the RDD of JSON strings into a DataFrame; the select below reads
// the strings from the column named "value".
val jsonDF = jsonRdd.toDF

// Structure the JSON string {"cm":"--","ap":"--","et":"--","id":"--"}:
// each top-level field becomes a column carrying the same name as its key.
val topLevelFields = Seq("cm", "ap", "et", "id")
val df = jsonDF.select(
  topLevelFields.map(field => get_json_object($"value", "$." + field).as(field)): _*
)

1.3 提取'cm'中信息
val df2 = df.select($"id",$"ap",
get_json_object($"cm","$.ln").as("ln"),
get_json_object($"cm","$.sv").as("sv"),
get_json_object($"cm","$.os").as("os"),
get_json_object($"cm","$.g").as("g"),
get_json_object($"cm","$.mid").as("mid"),
get_json_object($"cm","$.l").as("l"),
get_json_object($"cm","$.vc").as("vc"),
get_json_object($"cm","$.hw").as("hw"),
get_json_object($"cm","$.ar").as("ar"