reader to get the training data
reader to get the scoring data for comparison (optional; if not provided, exclusions will be based on training data checks only)
number of bins to use in computing feature distributions (histograms for numerics, hashes for strings)
minimum fill rate a feature must have in the training dataset and scoring dataset to be kept
maximum acceptable fill rate difference between training and scoring data for a feature to be kept
maximum acceptable fill ratio between training and scoring (larger / smaller)
maximum Jensen-Shannon divergence between training and scoring distributions for a feature to be kept
maximum absolute correlation allowed between raw predictor null indicator and label
type of correlation metric to use
features that are protected from removal by the JS divergence check
features that are protected from removal
formula to compute the text feature bin size. Input arguments are the feature Summary and the number of bins used in computing feature distributions (histograms for numerics, hashes for strings); the output is the number of bins to use for the text features.
Time period used to apply the circular date transformation for date features; if not specified, the regular numeric feature transformation will be used
Minimum row threshold for scoring set comparisons to be used in checks. If the scoring set size is below this threshold, then only training data checks will be used
number of bins to use in computing feature distributions (histograms for numerics, hashes for strings)
type of correlation metric to use
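In practice these settings are supplied when attaching the filter to a workflow. Below is a minimal sketch of that wiring, assuming the TransmogrifAI `OpWorkflow.withRawFeatureFilter` API; the parameter names, the example threshold values, and the `trainReader`/`scoreReader`/`prediction`/`age`/`gender` identifiers are illustrative assumptions, not prescribed values.

```scala
import com.salesforce.op._

// trainReader, scoreReader, prediction, age and gender are placeholders assumed
// to be defined elsewhere (data readers and OPFeatures built for the DAG).
val workflow = new OpWorkflow()
  .setResultFeatures(prediction)
  .withRawFeatureFilter(
    trainingReader = Option(trainReader),   // reader to get the training data
    scoringReader = Option(scoreReader),    // optional reader to get the scoring data
    bins = 100,                             // bins for feature distributions
    minFillRate = 0.001,                    // minimum fill rate to keep a feature
    maxFillDifference = 0.90,               // max train/score fill rate difference
    maxFillRatioDiff = 20.0,                // max train/score fill ratio (larger / smaller)
    maxJSDivergence = 0.90,                 // max Jensen-Shannon divergence
    maxCorrelation = 0.95,                  // max |correlation| of null indicator with label
    protectedFeatures = Array(age, gender)  // features protected from removal
  )
```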
Function that takes the raw features and parameters used in the workflow and uses this information, along with the readers for this stage, to determine which features should be dropped from the workflow
raw features used in the workflow
parameters used in the workflow
spark instance
dataframe with bad features and bad map keys removed, together with a list of all features that should be dropped from the DAG
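The shape of that call is roughly as follows. This is a sketch under assumptions about the TransmogrifAI `RawFeatureFilter.generateFilteredRaw` method and its result fields (`cleanedData`, `featuresToDrop`), which may differ by version.

```scala
import org.apache.spark.sql.SparkSession

// rawFeatureFilter, rawFeatures and params are assumed to be defined elsewhere:
// the configured filter, the Array[OPFeature] used in the workflow, and the OpParams.
implicit val spark: SparkSession = SparkSession.builder().getOrCreate()

val filtered = rawFeatureFilter.generateFilteredRaw(rawFeatures, params)

val cleanedData = filtered.cleanedData       // dataframe with bad features / bad map keys removed
val featuresToDrop = filtered.featuresToDrop // raw features to exclude from the DAG
```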
features that are protected from removal by the JS divergence check
maximum absolute correlation allowed between raw predictor null indicator and label
maximum acceptable fill rate difference between training and scoring data for a feature to be kept
maximum acceptable fill ratio between training and scoring (larger / smaller)
maximum Jensen-Shannon divergence between training and scoring distributions for a feature to be kept
minimum fill rate a feature must have in the training dataset and scoring dataset to be kept
Minimum row threshold for scoring set comparisons to be used in checks. If the scoring set size is below this threshold, then only training data checks will be used
features that are protected from removal
reader to get the scoring data for comparison (optional; if not provided, exclusions will be based on training data checks only)
formula to compute the text feature bin size. Input arguments are the feature Summary and the number of bins used in computing feature distributions (histograms for numerics, hashes for strings); the output is the number of bins to use for the text features.
Time period used to apply the circular date transformation for date features; if not specified, the regular numeric feature transformation will be used
reader to get the training data
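The text bins formula is a plain function from a feature `Summary` and the global bin count to the number of bins to use for text features. Below is a minimal sketch of a custom formula; the `Summary` import path and its `max` field are assumptions about the library's types.

```scala
import com.salesforce.op.filters.Summary  // assumed location of the Summary type

// Cap the number of text bins at the maximum value recorded in the Summary,
// so low-cardinality (likely categorical) text gets fewer bins.
def cappedTextBins(summary: Summary, bins: Int): Int =
  math.min(summary.max, bins.toDouble).toInt

// Supplied in place of the default, e.g. textBinsFormula = cappedTextBins _
```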
Specialized stage that will load up data and compute distributions and empty counts on raw features. This information is then used to determine which raw features should be excluded from the workflow DAG. Note: currently, raw features that aren't explicitly blocklisted, but are unused because they are inputs to explicitly blocklisted features, are not present as raw features in the model, nor in ModelInsights. However, they are accessible from an OpWorkflowModel via getRawFeatureFilterResults().
datatype of the reader
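As noted above, the filter's decisions survive on the fitted model even for features that never appear in the DAG. A minimal sketch of inspecting them follows; `getRawFeatureFilterResults()` is taken from the description above, while the `exclusionReasons` field and the `workflow.train()` usage are assumptions.

```scala
// Train the workflow (the filter runs as part of fitting), then inspect results.
val model: OpWorkflowModel = workflow.train()

val rffResults = model.getRawFeatureFilterResults()
// e.g. list which raw features were excluded and the checks that excluded them
rffResults.exclusionReasons.foreach(println)
```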