unique identifier for this workflow model
params that were used during model training
Efficiently applies all fitted stages, grouping them by level in the DAG where possible
Efficiently applies all fitted stages, grouping them by level in the DAG where possible
data to transform
computation graph
how often to break up the computation by persisting the data
spark session
transformed dataframe
Check that readers and features are set and that params match them
Check that readers and features are set and that params match them
Determine if any of the raw features do not have a matching reader
Determine if any of the raw features do not have a matching reader
Returns a dataframe containing all the columns generated up to and including the feature input
Returns a dataframe containing all the columns generated up to and including the feature input
input feature to compute up to
persist data in transforms every k stages for performance improvement
Dataframe containing columns corresponding to all of the features generated up to the feature given
IllegalArgumentException
if a feature is not part of this workflow
Computes a dataframe containing all the columns generated up to the feature input and saves it to the specified path in avro format
Computes a dataframe containing all the columns generated up to the feature input and saves it to the specified path in avro format
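A minimal sketch of both computeDataUpTo variants, assuming a fitted model: OpWorkflowModel, an implicit SparkSession in scope, and a feature reference checkedFeatures taken from the workflow definition (all names and the output path are illustrative):

  import org.apache.spark.sql.DataFrame

  // In-memory variant: returns every column generated up to and including the given feature
  val partialData: DataFrame = model.computeDataUpTo(checkedFeatures)

  // Saving variant: writes the same columns out to the given path in avro format
  model.computeDataUpTo(checkedFeatures, "/tmp/partial-data")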
Creates a copy of this OpWorkflowModel instance
Creates a copy of this OpWorkflowModel instance
copy of this OpWorkflowModel instance
Load up the data using the reader, transform it, and then evaluate it
Load up the data using the reader, transform it, and then evaluate it
OP Evaluator
path to write out the metrics
spark session
evaluation metrics
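A hedged sketch of evaluate for a binary classification problem, assuming label and prediction features named survived and prediction from the workflow definition and an implicit SparkSession in scope; the metricsPath parameter name mirrors the description above and is an assumption:

  import com.salesforce.op.evaluators.Evaluators

  val evaluator = Evaluators.BinaryClassification()
    .setLabelCol(survived)
    .setPredictionCol(prediction)

  // Reads the data with the reader, transforms it, then computes and writes out the metrics
  val metrics = model.evaluate(evaluator = evaluator, metricsPath = Some("/tmp/metrics"))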
Looks at model parents to match the parent stage for features (since features are created by the estimator, not the fitted transformer)
Looks at model parents to match the parent stage for features (since features are created by the estimator, not the fitted transformer)
feature to find the origin stage for
index of the parent stage
Used to generate dataframe from reader and raw features list
Used to generate dataframe from reader and raw features list
Dataframe with all the features generated + persisted
Get all the features that potentially are generated by the workflow: raw, intermediate and result features
Get all the features that potentially are generated by the workflow: raw, intermediate and result features
all the features that potentially are generated by the workflow: raw, intermediate and result features
Get the list of raw features which have been blocklisted
Get the list of raw features which have been blocklisted
blocklisted features
Get the list of Map Keys which have been blocklisted
Get the list of Map Keys which have been blocklisted
blocklisted map keys
Get the metadata associated with the features
Get the metadata associated with the features
features to get metadata for
metadata associated with the features
IllegalArgumentException
if a feature is not part of this workflow
Gets the fitted stage that generates the input feature
Gets the fitted stage that generates the input feature
Type of feature
feature to get the origin stage for
Fitted origin stage for feature
IllegalArgumentException
if a feature is not part of this workflow
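For example, assuming a result feature named prediction produced by this workflow, the fitted stage behind it can be looked up as follows (a sketch; the feature name is illustrative):

  // Throws IllegalArgumentException if the feature was not produced by this workflow
  val originStage = model.getOriginStageOf(prediction)
  println(originStage.uid)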
Get the parameter settings passed into the workflow
Get the parameter settings passed into the workflow
OpWorkflowParams set for this workflow
Get raw feature distribution information computed on training and scoring data during raw feature filter
Get raw feature distribution information computed on training and scoring data during raw feature filter
sequence of feature distribution information
Get raw feature filter results (filter configuration, feature distributions, and feature exclusion reasons)
Get raw feature filter results (filter configuration, feature distributions, and feature exclusion reasons)
raw feature filter results
Get the raw features generated by the workflow
Get the raw features generated by the workflow
raw features for workflow
Get raw feature distribution information computed on scoring data during raw feature filter
Get raw feature distribution information computed on scoring data during raw feature filter
sequence of feature distribution information
Get raw feature distribution information computed on training data during raw feature filter
Get raw feature distribution information computed on training data during raw feature filter
sequence of feature distribution information
Get data reader that will be used to generate data frame for stages
Get data reader that will be used to generate data frame for stages
reader for workflow
Get the final features generated by the workflow
Get the final features generated by the workflow
result features for workflow
Get the stages used in this workflow
Get the stages used in this workflow
stages in the workflow
Gets the updated version of a feature when the DAG has been modified with a raw feature filter
Gets the updated version of a feature when the DAG has been modified with a raw feature filter
feature to get the updated history for
Updated instance of feature
IllegalArgumentException
if a feature is not part of this workflow
Whether the cross-validation/train-validation-split will be done at workflow level
Whether the cross-validation/train-validation-split will be done at workflow level
true if the cross-validation will be done at workflow level, false otherwise
Get model insights for the model used to create the input feature.
Get model insights for the model used to create the input feature. Will traverse the DAG to find the LAST model selector and sanity checker used in the creation of the selected feature
feature to find model info for
Model insights class containing summary of modeling and sanity checking
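A brief sketch of pulling insights for a result feature, assuming a feature named prediction produced by a model selector in this workflow; the featureName field access is an assumption about the insights structure:

  val insights = model.modelInsights(prediction)
  // Summary of the last model selector and sanity checker found while traversing the DAG
  val contributingFeatures = insights.features.map(_.featureName)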
Save this model to a path
Save this model to a path
path to save the model
should overwrite if the path exists
local folder to copy and unpack stored model to for loading
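A minimal save/load sketch; the path is illustrative, the overwrite parameter name mirrors the description above, and loading back assumes access to the original workflow object that defined the model:

  // Persist the fitted model, overwriting anything already at the path
  model.save("/tmp/op-model", overwrite = true)

  // Later: reload the fitted model through the workflow that defined it
  val loadedModel: OpWorkflowModel = workflow.loadModel("/tmp/op-model")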
Load up the data as specified by the data reader, then transform that data using the transformers specified in this workflow.
Load up the data as specified by the data reader, then transform that data using the transformers specified in this workflow. We will always keep the key and result features in the returned dataframe, but there are options to keep the other raw & intermediate features.
This method optimizes scoring by grouping OpTransformer stages and applying them in bulk at each step. The rest of the stages are applied sequentially (as org.apache.spark.ml.Pipeline does)
optional path to write out the scores to a file
flag to enable keeping raw features in the output DataFrame as well
flag to enable keeping intermediate features in the output DataFrame as well
how often to break up Catalyst by persisting the data (applies to non-OpTransformer stages only); to turn this off, set to Int.MaxValue (not recommended)
should persist the final scores dataframe
Dataframe that contains all the columns generated by the transformers in this workflow model as well as the key and result features, along with other features if the above flags are set to true.
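A sketch of scoring with the options described above, assuming a fitted model with a scoring reader already set and an implicit SparkSession in scope; the named parameters mirror the descriptions above and are assumptions:

  import org.apache.spark.sql.DataFrame

  // Reads the data, applies all fitted transformers and optionally writes the scores out
  val scores: DataFrame = model.score(
    path = Some("/tmp/scores"),
    keepRawFeatures = false,
    keepIntermediateFeatures = false
  )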
Load up the data as specified by the data reader, then transform that data using the transformers specified in this workflow.
Load up the data as specified by the data reader, then transform that data using the transformers specified in this workflow. We will always keep the key and result features in the returned dataframe, but there are options to keep the other raw & intermediate features.
This method optimizes scoring by grouping OpTransformer stages and applying them in bulk at each step. The rest of the stages are applied sequentially (as org.apache.spark.ml.Pipeline does)
evaluator to use for metrics generation
optional path to write out the scores to a file
flag to enable keeping raw features in the output DataFrame as well
flag to enable keeping intermediate features in the output DataFrame as well
how often to break up Catalyst by persisting the data (applies to non-OpTransformer stages only); to turn this off, set to Int.MaxValue (not recommended)
should persist the final scores dataframe
optional path to write out the metrics to a file
Dataframe that contains all the columns generated by the transformers in this workflow model as well as the key and result features, along with other features if the above flags are set to true. Also returns the metrics computed with the evaluator.
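A sketch of scoring and evaluating in one pass, reusing the evaluator from the evaluate example above (names are illustrative):

  // Returns both the scored dataframe and the metrics computed by the evaluator
  val (scores, metrics) = model.scoreAndEvaluate(evaluator = evaluator)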
Set the input dataset, which contains columns corresponding to the raw features used in the workflow. The type of the dataset (Dataset[T]) must match the type of the FeatureBuilders[T] used to generate the raw features
Set the input dataset, which contains columns corresponding to the raw features used in the workflow. The type of the dataset (Dataset[T]) must match the type of the FeatureBuilders[T] used to generate the raw features
input dataset for workflow
key extraction function
this workflow
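A minimal sketch, assuming a SparkSession named spark and a Passenger case class that matches the FeatureBuilder[Passenger] definitions used for the raw features and is defined alongside them; the key parameter name mirrors the description above and is an assumption:

  import org.apache.spark.sql.Dataset
  import spark.implicits._

  // Passenger and its fields are assumed from the workflow's raw feature definitions
  val passengers: Dataset[Passenger] = Seq(Passenger(id = 1L, age = Some(22.0), survived = Some(1.0))).toDS()
  val withData = model.setInputDataset(passengers, key = (p: Passenger) => p.id.toString)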
Set the input RDD, which contains columns corresponding to the raw features used in the workflow. The type of the RDD (RDD[T]) must match the type of the FeatureBuilders[T] used to generate the raw features
Set the input RDD, which contains columns corresponding to the raw features used in the workflow. The type of the RDD (RDD[T]) must match the type of the FeatureBuilders[T] used to generate the raw features
input rdd for workflow
key extraction function
this workflow
Set reader parameters from the OpWorkflowParams object for this run (stage parameters passed in will have no effect)
Set reader parameters from the OpWorkflowParams object for this run (stage parameters passed in will have no effect)
new parameter values
Set data reader that will be used to generate data frame for stages
Set data reader that will be used to generate data frame for stages
reader for workflow
this workflow
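A sketch of swapping in a reader before scoring, assuming the same Passenger case class as above and an illustrative csv path; DataReaders.Simple.csvCase is the reader factory typically used for case-class backed csv input:

  import com.salesforce.op.readers.DataReaders

  val scoringReader = DataReaders.Simple.csvCase[Passenger](
    path = Option("/tmp/passengers-to-score.csv"),
    key = _.id.toString
  )

  // Returns this workflow model with the new reader set, ready for score()
  val scoringModel = model.setReader(scoringReader)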
Extracts all summary metadata from transformers in JSON format
Extracts all summary metadata from transformers in JSON format
json string summary
Extracts all summary metadata from transformers in JSON format
Extracts all summary metadata from transformers in JSON format
json summary
Generates a high level model summary in a compact print friendly format containing: selected model info, model evaluation results and feature correlations/contributions/cramersV values.
Generates a high level model summary in a compact print friendly format containing: selected model info, model evaluation results and feature correlations/contributions/cramersV values.
model insights to compute the summary against
top K of feature correlations/contributions/cramersV values to print
high level model summary in a compact print friendly format
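A sketch of the summary accessors on a trained model; whether summaryPretty exposes the top-K limit as a parameter is an assumption, so it is only noted in a comment here:

  // Full summary metadata extracted from the fitted transformers, as a JSON string
  val jsonSummary: String = model.summary()

  // Compact, print friendly summary (optionally limited to the top K feature contributions)
  println(model.summaryPretty())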
params that were used during model training
unique identifier for this workflow model
unique identifier for this workflow model
:: Experimental :: Decides whether the cross-validation/train-validation-split will be done at the workflow level. This removes issues with data leakage; however, it will impact the runtime
:: Experimental :: Decides whether the cross-validation/train-validation-split will be done at the workflow level. This removes issues with data leakage; however, it will impact the runtime
this workflow, which will train part of the DAG within the cross-validation/train-validation split
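A sketch of enabling workflow-level cross-validation when the workflow is defined, before training; the prediction feature and trainReader names are illustrative:

  import com.salesforce.op.OpWorkflow

  val fittedModel = new OpWorkflow()
    .setResultFeatures(prediction)
    .setReader(trainReader)
    .withWorkflowCV
    .train()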
A workflow model is a container and executor for the sequence of transformations that have been fit to the data to produce the desired output features