package com.cablelabs.eventgen.algorithm

import com.cablelabs.eventgen.analysis.SparkAnalyzer
import org.apache.spark.mllib.regression.LabeledPoint

/**
 * Validates the predictions for a given model
 */
object ModelValidator {

  /**
   * Validates a trained model by re-predicting over its own training set and checking the
   * aggregate predictions against an accuracy threshold.
   *
   * @param model        the trained model; dispatched on runtime type. NOTE(review): the match has no
   *                     default case, so any other `Model` subtype throws a `MatchError` — confirm
   *                     `Model` is sealed with only these two implementations
   * @param trainingSet  the labeled points the model was trained on; must be non-empty
   * @param analyzer     provides the original events and the feature extraction used for training
   * @param accuracy     threshold factor: regression requires avg prediction > avg label * accuracy;
   *                     classification requires success ratio > accuracy
   * @param outputValues when true, prints each label/prediction pair for manual inspection
   */
  def validateModel(model: Model, trainingSet: Array[LabeledPoint], analyzer: SparkAnalyzer, accuracy: Double,
                    outputValues: Boolean): Unit = model match {
    case regression: LinearRegressionModel =>
      validateRegression(regression, trainingSet, analyzer, accuracy, outputValues)
    case bayes: NaiveBayesModel =>
      validateNaiveBayes(bayes, trainingSet, accuracy, outputValues)
  }

  /**
   * Validates a regression model: predicts over the training features, prints summary statistics,
   * and asserts the average prediction clears the label-derived threshold. Also re-derives the
   * feature vectors from the raw events and asserts both prediction totals are close.
   */
  private def validateRegression(model: LinearRegressionModel, trainingSet: Array[LabeledPoint],
                                 analyzer: SparkAnalyzer, accuracy: Double, outputValues: Boolean): Unit = {
    require(trainingSet.nonEmpty, "trainingSet must not be empty")

    // Predict based on the actual training set
    val labelsAndPreds = trainingSet.map(p => (p.label, model.predictRaw(p.features.toArray)))

    labelsAndPreds.foreach { case (label, pred) =>
      assert(label > -1)
      // TODO - This should always be < 0 but the current test data and algorithms are currently not working properly
      //      assert(pred > 0)
      if (outputValues) {
        println(s"label = $label - pred = $pred")
      }
    }

    val labels = labelsAndPreds.map(_._1)
    val preds = labelsAndPreds.map(_._2)
    val totalPred = preds.sum
    val averageLabel = labels.sum / trainingSet.length
    val averagePred = totalPred / trainingSet.length
    val thresholdVal = averageLabel * accuracy

    println(s"min pred = ${preds.min}")
    println(s"max pred = ${preds.max}")
    println(s"average prediction = $averagePred")
    println(s"min label = ${labels.min}")
    println(s"max label = ${labels.max}")
    println(s"average label = $averageLabel")

    // NOTE(review): asserts averagePred exceeds the threshold; the original comment claimed the
    // opposite direction ("cost … < the threshold") — behavior kept, confirm intended direction
    assert(thresholdVal < averagePred)

    // Predict based on the actual events used for training: the feature vectors re-derived from the
    // raw events must yield predictions close to those from the stored training features
    val events = analyzer.events().collect()
    assert(events.length == trainingSet.length)
    val totalPredFromEvents = events.map(e => model.predictRaw(analyzer.inputDef.temporalAlgoFeatures(e))).sum

    println(s"Total pred 1 = $totalPred, Total pred 2 = $totalPredFromEvents, Difference = ${Math.abs(totalPred - totalPredFromEvents)}")
    // TODO - hard-coded absolute tolerance; consider scaling it to the magnitude of the predictions
    assert(Math.abs(totalPred - totalPredFromEvents) < 1000000)
  }

  /**
   * Validates a classification model: predicts over the training features, counts exact label
   * matches, and asserts the success ratio exceeds the required accuracy.
   */
  private def validateNaiveBayes(model: NaiveBayesModel, trainingSet: Array[LabeledPoint],
                                 accuracy: Double, outputValues: Boolean): Unit = {
    require(trainingSet.nonEmpty, "trainingSet must not be empty")

    // Predict based on the actual training set, counting exact matches against the mapped label
    val numMatches = trainingSet.count { p =>
      val pred = model.predict(p.features.toArray)
      val labelValue = model.labelValue(p.label)
      if (outputValues) {
        println(s"label = $labelValue - pred = $pred")
      }
      labelValue == pred
    }

    val successPercentage = numMatches.toDouble / trainingSet.length.toDouble
    println(s"% Success = $successPercentage")
    println(s"Number of keys = ${model.labelMap.size}")

    if (outputValues) println(s"LabelMap values - ${model.labelMap.values}")

    assert(accuracy < successPercentage)
  }

}