Commit ad99dbc9 authored by Steven Pisarski

CM fact prediction tuning.

parent af6e9c1e
......@@ -216,7 +216,8 @@ object AnalyzeData extends App {
val countMap = mutable.Map[Long, mutable.Set[String]]()
// TODO - this routine could possibly be optimized with an RDD, but it appears to perform adequately
analyzer.dimEventsCount().foreach(p => {
val dimEventsCount = analyzer.dimEventsCount()
dimEventsCount.foreach(p => {
val value = countMap.get(p._2)
if (value == None) {
countMap += (p._2 -> mutable.Set[String](p._1))
......@@ -228,8 +229,13 @@ object AnalyzeData extends App {
val fileSystem = FileSystem.get(new URI(config.dimEventsCountUri), new Configuration())
val stream = fileSystem.create(new Path(config.dimEventsCountUri))
countMap.toList sortBy (_._1) foreach {
case (key, value) => stream.write(s"$key events for ${value.size} dimensions\n".getBytes)
case (key, value) =>
stream.write(s"$key ${if (key > 1) "events" else "event"} for ${value.size} dimensions\n".getBytes)
}
stream.write("\n".getBytes)
dimEventsCount.toList.sortBy(_._2).foreach(f => stream.write(s"${f._1}\t${f._2}\n".getBytes))
stream.close()
logger.info(s"Completed persisting the count of events by dimensional set ${config.dimEventsCountUri}")
}, scheduler, "dimEventsCount")
......
......@@ -25,7 +25,7 @@ class AnalyzeDataYamlTest extends UnitSpec {
assert("local[8]" == config.sparkUri)
assert("CM-Analyzer-small" == config.appName)
assert(config.inputDef != null)
assert(16 == config.inputDef.fieldMap.size)
assert(9 == config.inputDef.fieldMap.size)
assert('|' == config.fileDelim)
assert("testData/cm/events/cm_1a.txt" == config.eventsUri)
assert("/tmp/cm/analysis/temporalTrainingSet.rdd" == config.temporalTrainingSetUri)
......@@ -99,7 +99,7 @@ eventsUri: testData/cm/events/cm_1a.txt"""
assert(null == config.sparkUri)
assert("DataAnalyzer" == config.appName)
assert(config.inputDef != null)
assert(16 == config.inputDef.fieldMap.size)
assert(9 == config.inputDef.fieldMap.size)
assert(',' == config.fileDelim)
assert("testData/cm/events/cm_1a.txt" == config.eventsUri)
assert(null == config.temporalTrainingSetUri)
......
......@@ -36,7 +36,7 @@ class GeneratorYamlTest extends UnitSpec {
assert(config.seedFilters.contains("99:2b:b2:11:4k:k4"))
assert(config.seedFilters.contains("99:46:8g:b9:10:9z"))
assert(config.inputDef != null)
assert(16 == config.inputDef.fieldMap.size)
assert(9 == config.inputDef.fieldMap.size)
assert(2 == config.outputDefs.size)
assert(',' == config.fileDelim)
assert(null == config.eventsUri)
......@@ -80,7 +80,7 @@ seedFilters:
assert(config.seedFilters.contains("foo"))
assert(config.seedFilters.contains("bar"))
assert(config.inputDef != null)
assert(16 == config.inputDef.fieldMap.size)
assert(9 == config.inputDef.fieldMap.size)
assert(2 == config.outputDefs.size)
assert(',' == config.fileDelim)
assert(null == config.eventsUri)
......@@ -184,7 +184,7 @@ seedEventsUri: seedEvents.rdd"""
assert(!config.analyzeFirst)
assert(0 == config.seedFilters.size)
assert(config.inputDef != null)
assert(16 == config.inputDef.fieldMap.size)
assert(9 == config.inputDef.fieldMap.size)
assert(2 == config.outputDefs.size)
assert(',' == config.fileDelim)
assert(null == config.eventsUri)
......@@ -231,7 +231,7 @@ seedFilters:
assert(config.seedFilters.contains("foo"))
assert(config.seedFilters.contains("bar"))
assert(config.inputDef != null)
assert(16 == config.inputDef.fieldMap.size)
assert(9 == config.inputDef.fieldMap.size)
assert(2 == config.outputDefs.size)
assert(',' == config.fileDelim)
assert(null == config.eventsUri)
......@@ -264,7 +264,7 @@ fileDelim: "|"
assert(config.analyzeFirst)
assert(0 == config.seedFilters.size)
assert(config.inputDef != null)
assert(16 == config.inputDef.fieldMap.size)
assert(9 == config.inputDef.fieldMap.size)
assert(2 == config.outputDefs.size)
assert('|' == config.fileDelim)
assert("events.file" == config.eventsUri)
......@@ -296,7 +296,7 @@ eventsUri: events.file
assert(config.analyzeFirst)
assert(0 == config.seedFilters.size)
assert(config.inputDef != null)
assert(16 == config.inputDef.fieldMap.size)
assert(9 == config.inputDef.fieldMap.size)
assert(2 == config.outputDefs.size)
assert(',' == config.fileDelim)
assert("events.file" == config.eventsUri)
......@@ -340,7 +340,7 @@ seedFilters:
assert(config.seedFilters.contains("foo"))
assert(config.seedFilters.contains("bar"))
assert(config.inputDef != null)
assert(16 == config.inputDef.fieldMap.size)
assert(9 == config.inputDef.fieldMap.size)
assert(2 == config.outputDefs.size)
assert('|' == config.fileDelim)
assert("events.file" == config.eventsUri)
......@@ -390,7 +390,7 @@ remoteActorAddr:
assert(config.analyzeFirst)
assert(0 == config.seedFilters.size)
assert(config.inputDef != null)
assert(16 == config.inputDef.fieldMap.size)
assert(9 == config.inputDef.fieldMap.size)
assert(2 == config.outputDefs.size)
assert(',' == config.fileDelim)
assert("events.file" == config.eventsUri)
......
......@@ -17,7 +17,6 @@ class CmFactPredictionsTest extends FactPredictionsTester {
val eventUri = new File("testData/cm/events/cm_1a.txt").toURI.toString
val delim = '|'
// TODO - Determine why this fact is not predicting well at all
ignoreFacts = Set("downstream_receive_power_num")
// Set to true to see the predicted values in the output log
// outputValues = true
}
......@@ -11,6 +11,7 @@ abstract class FactPredictionsTester extends AnalyzerTester {
// Hack: facts named in this set are skipped by the test below, in case a problematic fact simply cannot be
// predicted properly with a minimal training set
var ignoreFacts = Set[String]()
var outputValues: Boolean = false
analyzerTest("Analyze fact preditions to ensure the average prediction is within 90% of the average label") {
inputDef.positionalFacts.filter(f => !ignoreFacts.contains(f.name))foreach(fact => {
......@@ -26,7 +27,7 @@ abstract class FactPredictionsTester extends AnalyzerTester {
}
println(s"Validating model for fact with name - ${fact.name}")
ModelValidator.validateModel(model, trainingSet.collect(), analyzer, 0.90, outputValues = false)
ModelValidator.validateModel(model, trainingSet.collect(), analyzer, 0.90, outputValues = outputValues)
})
}
}
......@@ -57,360 +57,90 @@ facts:
- name: downstream_receive_power_num
description: fact 1-out
type: float
position: 140
position: 10
algo:
name: linearRegression
flatten:
mode: log
base: 2
iterations: 3
base: 5
iterations: 2
polyDegree: 3
iterations: 75
stepSize: 0.002
stepSize: 0.25
omitFields:
- day_of_week
- day_of_year
- hour_of_day
- cmts
- node
weights:
- name: day_of_week
weight: 10
- name: day_of_month
weight: 10
- name: day_of_year
weight: 10
- name: hour_of_day
weight: 50
- name: month_of_year
weight: 25
- name: year
weight: 10
- name: cmts
weight: 150
- name: node
weight: 200
- name: mac
weight: 250
weight: 325
- name: lat
weight: 250
weight: 200
- name: lng
weight: 250
weight: 200
- name: month_of_year
weight: 33
- name: day_of_month
weight: 33
- name: upstream_transmit_power_num
description: fact 2-out
type: float
position: 110
position: 20
algo:
name: linearRegression
flatten:
mode: log
base: 5
iterations: 2
polyDegree: 2
polyDegree: 3
iterations: 75
stepSize: 0.04
stepSize: 0.25
omitFields:
- day_of_week
- day_of_year
- hour_of_day
- cmts
- node
weights:
- name: day_of_week
weight: 10
- name: day_of_month
weight: 10
- name: day_of_year
weight: 10
- name: hour_of_day
weight: 100
- name: month_of_year
weight: 50
- name: year
weight: 10
- name: cmts
weight: 100
- name: node
weight: 100
- name: mac
weight: 150
weight: 325
- name: lat
weight: 150
weight: 200
- name: lng
weight: 150
weight: 200
- name: month_of_year
weight: 33
- name: day_of_month
weight: 33
- name: downstream_snr_rt
description: fact 3-out
type: float
position: 130
algo:
name: linearRegression
flatten:
mode: log
base: 5
iterations: 2
polyDegree: 2
iterations: 75
stepSize: 0.05
weights:
- name: day_of_week
weight: 10
- name: day_of_month
weight: 10
- name: day_of_year
weight: 10
- name: hour_of_day
weight: 100
- name: month_of_year
weight: 50
- name: year
weight: 10
- name: cmts
weight: 100
- name: node
weight: 100
- name: mac
weight: 150
- name: lat
weight: 150
- name: lng
weight: 150
- name: t3_timeouts_cnt
description: fact 4
type: integer
position: 40
algo:
name: linearRegression
flatten:
mode: log
base: 5
iterations: 2
polyDegree: 4
iterations: 30
stepSize: 0.02
weights:
- name: day_of_week
weight: 10
- name: day_of_month
weight: 10
- name: day_of_year
weight: 10
- name: hour_of_day
weight: 100
- name: month_of_year
weight: 50
- name: year
weight: 10
- name: cmts
weight: 100
- name: node
weight: 100
- name: mac
weight: 150
- name: lat
weight: 150
- name: lng
weight: 150
- name: t4_timeouts_cnt
description: fact 4b
type: integer
position: 41
position: 30
algo:
name: linearRegression
flatten:
mode: log
base: 5
iterations: 2
polyDegree: 4
iterations: 30
stepSize: 0.02
weights:
- name: day_of_week
weight: 10
- name: day_of_month
weight: 10
- name: day_of_year
weight: 10
- name: hour_of_day
weight: 100
- name: month_of_year
weight: 50
- name: year
weight: 10
- name: cmts
weight: 100
- name: node
weight: 100
- name: mac
weight: 150
- name: lat
weight: 150
- name: lng
weight: 150
- name: lost_syncs_cnt
description: fact 5
type: integer
position: 50
algo:
name: linearRegression
flatten:
mode: log
base: 5
iterations: 3
polyDegree: 3
iterations: 50
stepSize: 0.02
weights:
- name: day_of_week
weight: 10
- name: day_of_month
weight: 10
- name: day_of_year
weight: 10
- name: hour_of_day
weight: 100
- name: month_of_year
weight: 50
- name: year
weight: 10
- name: cmts
weight: 100
- name: node
weight: 100
- name: mac
weight: 150
- name: lat
weight: 150
- name: lng
weight: 150
- name: resets_cnt
description: fact 6
type: integer
position: 60
algo:
name: linearRegression
flatten:
mode: log
base: 5
iterations: 2
polyDegree: 2
iterations: 70
stepSize: 0.025
weights:
- name: day_of_week
weight: 10
- name: day_of_month
weight: 10
- name: day_of_year
weight: 10
- name: hour_of_day
weight: 100
- name: month_of_year
weight: 50
- name: year
weight: 10
- name: cmts
weight: 100
- name: node
weight: 100
- name: mac
weight: 150
- name: lat
weight: 150
- name: lng
weight: 150
- name: ds_fec_corrected_cnt
description: fact 7
type: integer
position: 70
algo:
name: linearRegression
flatten:
mode: log
base: 5
iterations: 2
polyDegree: 4
iterations: 30
stepSize: 0.02
iterations: 75
stepSize: 0.25
omitFields:
- day_of_week
- day_of_year
- hour_of_day
- cmts
- node
weights:
- name: day_of_week
weight: 10
- name: day_of_month
weight: 10
- name: day_of_year
weight: 10
- name: hour_of_day
weight: 100
- name: month_of_year
weight: 50
- name: year
weight: 10
- name: cmts
weight: 100
- name: node
weight: 100
- name: mac
weight: 150
weight: 325
- name: lat
weight: 150
weight: 200
- name: lng
weight: 150
- name: ds_fec_uncorrected_cnt
description: fact 8
type: integer
position: 80
algo:
name: linearRegression
flatten:
mode: log
base: 5
iterations: 2
polyDegree: 4
iterations: 30
stepSize: 0.02
weights:
- name: day_of_week
weight: 10
- name: day_of_month
weight: 10
- name: day_of_year
weight: 10
- name: hour_of_day
weight: 100
weight: 200
- name: month_of_year
weight: 50
- name: year
weight: 10
- name: cmts
weight: 100
- name: node
weight: 100
- name: mac
weight: 150
- name: lat
weight: 150
- name: lng
weight: 150
- name: ds_fec_unerrored_cnt
description: fact 9
type: integer
position: 90
algo:
name: linearRegression
flatten:
mode: log
base: 5
iterations: 2
polyDegree: 4
iterations: 30
stepSize: 0.02
weights:
- name: day_of_week
weight: 10
weight: 33
- name: day_of_month
weight: 10
- name: day_of_year
weight: 10
- name: hour_of_day
weight: 100
- name: month_of_year
weight: 50
- name: year
weight: 10
- name: cmts
weight: 100
- name: node
weight: 100
- name: mac
weight: 150
- name: lat
weight: 150
- name: lng
weight: 150
weight: 33
schemaUri: hdfs://bda-hdfs01/tmp/cm/conf/input/cm-constant-prod-input.yaml
schemaUri: hdfs://bda-hdfs01/tmp/cm/conf/input/cm-constant-short-input.yaml
fileDelim: "|"
eventsUri: hdfs://bda-hdfs01/tmp/cm/data
temporalTrainingSetUri: hdfs://bda-hdfs01/tmp/cm/analysis/temporalTrainingSet.rdd
......@@ -7,7 +7,7 @@ seedEventsUri: hdfs://bda-hdfs01/tmp/cm/analysis/seedEvents.rdd
eventCountUri: hdfs://bda-hdfs01/tmp/cm/analysis/eventCount.txt
dimCountUri: hdfs://bda-hdfs01/tmp/cm/analysis/dimCount.txt
dimEventsCountUri: hdfs://bda-hdfs01/tmp/cm/analysis/dimEventsCount.txt
factValuesUri: hdfs://bda-hdfs01/tmp/cm/analysis/factValues.txt
#factValuesUri: hdfs://bda-hdfs01/tmp/cm/analysis/factValues.txt
temporalTrainingMetricsUri: hdfs://bda-hdfs01/tmp/cm/analysis/temporalTrainingMetrics.txt
factTrainingMetricsUri: hdfs://bda-hdfs01/tmp/cm/analysis/factTrainingMetrics
numThreads: 20
\ No newline at end of file
schemaUri: hdfs://bda-hdfs01/tmp/cm/conf/input/cm-constant-prod-input.yaml
outputDefUri: hdfs://bda-hdfs01/tmp/cm/conf/output/cm-out.yaml
temporalTrainingSetUri: hdfs://bda-hdfs01/tmp/cm/analysis/temporalTrainingSet.rdd
factTrainingSetUri: hdfs://bda-hdfs01/tmp/cm/analysis/factTrainingSet.rdd
factTrainingSetUri: hdfs://bda-hdfs01/tmp/cm/analysis/factTrainingSets
seedEventsUri: hdfs://bda-hdfs01/tmp/cm/analysis/seedEvents.rdd
sendPastEvents: true
useNow: true
......
schemaUri: hdfs://bda-hdfs01/tmp/cm/conf/input/cm-constant-prod-input.yaml
outputDefUri: hdfs://bda-hdfs01/tmp/cm/conf/output/cm-out.yaml
temporalTrainingSetUri: hdfs://bda-hdfs01/tmp/cm/analysis/temporalTrainingSet.rdd
factTrainingSetUri: hdfs://bda-hdfs01/tmp/cm/analysis/factTrainingSet.rdd
factTrainingSetUri: hdfs://bda-hdfs01/tmp/cm/analysis/factTrainingSets
seedEventsUri: hdfs://bda-hdfs01/tmp/cm/analysis/seedEvents.rdd
sendPastEvents: true
useNow: true
......
schemaUri: hdfs://bda-hdfs01/tmp/cm/conf/input/cm-constant-short-input.yaml
outputDefUri: hdfs://bda-hdfs01/tmp/cm/conf/output/cm-out.yaml
temporalTrainingSetUri: hdfs://bda-hdfs01/tmp/cm/analysis/temporalTrainingSet.rdd
factTrainingSetUri: hdfs://bda-hdfs01/tmp/cm/analysis/factTrainingSet.rdd
factTrainingSetUri: hdfs://bda-hdfs01/tmp/cm/analysis/factTrainingSets
seedEventsUri: hdfs://bda-hdfs01/tmp/cm/analysis/seedEvents.rdd
sendPastEvents: true
useNow: true
......
schemaUri: hdfs://bda-hdfs01/tmp/cm/conf/input/cm-constant-short-input.yaml
outputDefUri: hdfs://bda-hdfs01/tmp/cm/conf/output/cm-out.yaml
temporalTrainingSetUri: hdfs://bda-hdfs01/tmp/cm/analysis/temporalTrainingSet.rdd
factTrainingSetUri: hdfs://bda-hdfs01/tmp/cm/analysis/factTrainingSet.rdd
factTrainingSetUri: hdfs://bda-hdfs01/tmp/cm/analysis/factTrainingSets
seedEventsUri: hdfs://bda-hdfs01/tmp/cm/analysis/seedEvents.rdd
sendPastEvents: true
useNow: true
......
......@@ -38,360 +38,90 @@ facts:
- name: downstream_receive_power_num
description: fact 1-out
type: float
position: 140
position: 10
algo:
name: linearRegression
flatten:
mode: log
base: 2
iterations: 3
base: 5
iterations: 2
polyDegree: 3
iterations: 75
stepSize: 0.002
stepSize: 0.25
omitFields:
- day_of_week
- day_of_year
- hour_of_day
- cmts
- node
weights:
- name: day_of_week
weight: 10
- name: day_of_month
weight: 10
- name: day_of_year
weight: 10
- name: hour_of_day
weight: 50
- name: month_of_year
weight: 25
- name: year
weight: 10
- name: cmts
weight: 150
- name: node
weight: 200
- name: mac
weight: 250
weight: 325
</