unique id for the workflow
Efficiently applies all fitted stages, grouping by level in the DAG where possible
data to transform
computation graph
points in the computation at which to persist intermediate results
spark session
transformed dataframe
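A minimal conceptual sketch of the level-wise application described above, not the library's internal implementation; the method and variable names are illustrative.
{{{
import org.apache.spark.ml.Transformer
import org.apache.spark.sql.DataFrame

// Illustrative only: walk the DAG one level at a time, applying every fitted
// transformer in a level, and persist the running DataFrame every k stages to
// truncate the growing Spark plan.
def applyByLevel(
  data: DataFrame,
  dagLevels: Seq[Seq[Transformer]],
  persistEveryKStages: Int = 5
): DataFrame = {
  var df = data
  var applied = 0
  for (level <- dagLevels; stage <- level) {
    df = stage.transform(df)
    applied += 1
    if (applied % persistEveryKStages == 0) df = df.persist()
  }
  df
}
}}}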
Check that readers and features are set and that params match them
Determine if any of the raw features do not have a matching reader
Returns a dataframe containing all the columns generated up to and including the input feature
input feature to compute up to
persist data in transforms every k stages for performance improvement
Dataframe containing columns corresponding to all of the features generated up to the feature given
Computes a dataframe containing all the columns generated up to the input feature and saves it to the specified path in Avro format
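A hedged usage sketch of the two variants described above; model is a fitted OpWorkflowModel, normedAge stands in for any feature in its DAG, and the output path is purely illustrative.
{{{
// Materialize every column generated up to and including the given feature
val partialData = model.computeDataUpTo(normedAge)

// Same computation, but written out to the given path (Avro format) instead of returned
model.computeDataUpTo(normedAge, "/tmp/columns-up-to-normedAge")
}}}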
Looks at model parents to find the parent stage for features (since features are created from the estimator, not the fitted transformer)
feature to find the origin stage for
index of the parent stage
Fit the estimators to return a sequence of only transformers. Modified version of the Spark 2.x Pipeline.
dataframe to fit on
stages that need to be converted to transformers
persist data in transforms every k stages for performance improvement
fitted transformers
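A rough sketch of the Spark-Pipeline-style fit loop this describes, not the library's actual implementation: estimators are fit against the running dataframe and replaced by the transformers they produce, so downstream stages see the columns they need (the persist-every-k-stages optimization is omitted for brevity).
{{{
import org.apache.spark.ml.{Estimator, PipelineStage, Transformer}
import org.apache.spark.sql.DataFrame

// Illustrative only: fit estimators in topological order, returning transformers.
def fitStages(data: DataFrame, stages: Array[PipelineStage]): Array[Transformer] = {
  var df = data
  stages.map {
    case estimator: Estimator[_] =>
      val fitted = estimator.fit(df).asInstanceOf[Transformer]
      df = fitted.transform(df)
      fitted
    case transformer: Transformer =>
      df = transformer.transform(df)
      transformer
  }
}
}}}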
Used to generate dataframe from reader and raw features list
Dataframe with all the features generated and persisted
Get all the features that are potentially generated by the workflow: raw, intermediate and result features
all the features that are potentially generated by the workflow: raw, intermediate and result features
Get the list of raw features which have been blocklisted
blocklisted features
Get the list of Map Keys which have been blocklisted
blocklisted map keys
Get the parameter settings passed into the workflow
OpWorkflowParams set for this workflow
Get raw feature distribution information computed on training and scoring data during raw feature filter
sequence of feature distribution information
Get raw feature filter results (filter configuration, feature distributions, and feature exclusion reasons)
raw feature filter results
Get the raw features generated by the workflow
raw features for workflow
Get raw feature distribution information computed on scoring data during raw feature filter
sequence of feature distribution information
Get raw feature distribution information computed on training data during raw feature filter
sequence of feature distribution information
Get the data reader that will be used to generate the dataframe for the stages
reader for workflow
Get the final features generated by the workflow
result features for workflow
Get the stages used in this workflow
stages in the workflow
Whether the cross-validation/train-validation-split will be done at workflow level
true if the cross-validation will be done at workflow level, false otherwise
Load a previously trained workflow model from a path
path to the trained workflow model
whether to load the transformers as Spark-native or MLeap transformers, along with the TransmogrifAI transformers
local folder to copy and unpack the stored model into for loading
workflow model
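A hedged usage sketch: the path is illustrative, and workflow is assumed to be an OpWorkflow built with the same features and stages as the one that produced the saved model.
{{{
import com.salesforce.op.OpWorkflowModel

val model: OpWorkflowModel = workflow.loadModel("/my/saved/model/path")
}}}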
Set the input dataset, which contains columns corresponding to the raw features used in the workflow. The type of the dataset (Dataset[T]) must match the type of the FeatureBuilders[T] used to generate the raw features.
input dataset for workflow
key extract function
this workflow
Set the input RDD, which contains columns corresponding to the raw features used in the workflow. The type of the RDD (RDD[T]) must match the type of the FeatureBuilders[T] used to generate the raw features.
input rdd for workflow
key extract function
this workflow
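A hedged sketch of both setters; Passenger, passengerDS and passengerRDD are illustrative names, and the element type must match the type parameter of the FeatureBuilders that defined the raw features.
{{{
// Dataset[Passenger] input with a key-extraction function
workflow.setInputDataset(passengerDS, (p: Passenger) => p.id.toString)

// RDD[Passenger] input with the same key-extraction idea
workflow.setInputRDD(passengerRDD, (p: Passenger) => p.id.toString)
}}}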
Set stage and reader parameters from an OpWorkflowParams object for the run
new parameter values
this workflow
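A hedged sketch, assuming the parameter container is the library's OpParams class (the description above calls it OpWorkflowParams); in practice the parameters are usually loaded from a configuration file rather than constructed inline.
{{{
import com.salesforce.op.OpParams

val params = new OpParams()  // stand-in for a configuration loaded from file
workflow.setParameters(params)
}}}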
Set the data reader that will be used to generate the dataframe for the stages
reader for workflow
this workflow
This is used to set the stages of the workflow.
By setting the final features, the stages used to generate them can be traced back through the parent features and origin stages. The input is a tuple of features to support leaf feature generation (multiple endpoints in feature generation).
Final features generated by the workflow
Fit all of the estimators in the pipeline and return a pipeline model of only transformers. Uses data loaded as specified by the data reader to generate the initial data set.
persist data in transforms every k stages for performance improvement
a fitted pipeline model
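A hedged usage sketch; an implicit SparkSession is assumed to be in scope, and the workflow is assumed to already have its reader (or input dataset) and result features set. The optional persist-every-k-stages argument described above is left at its default.
{{{
val fittedModel = workflow.train()
}}}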
unique id for the workflow
Replaces any estimators in this workflow with their corresponding fitted models from the OpWorkflowModel passed in. Note that the stage UIDs must correspond EXACTLY in order to be replaced, so the same features and stages must be used in both the fitted OpWorkflowModel and this OpWorkflow. Any estimators that are not part of the OpWorkflowModel passed in will be trained when .train() is called on this OpWorkflow.
model containing fitted stages to be used in this workflow
an OpWorkflow containing all of the stages from this model plus any new stages needed to generate the features not included in the fitted model
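A hedged sketch of reusing previously fitted stages; the method name withModelStages is an assumption based on current versions of the library, and previouslyFittedModel is assumed to have been trained on the same features and stages.
{{{
// Reuse fitted stages where UIDs match; anything new is trained by the later .train() call
val updatedWorkflow = workflow.withModelStages(previouslyFittedModel)
val newModel = updatedWorkflow.train()
}}}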
Add a raw features filter to the workflow to look at fill rates and distributions of raw features and exclude features that do not meet specifications from the modeling DAG (a usage sketch follows the parameter descriptions below)
Type of the data read in
training reader to use in the filter; if not supplied, will fall back to the reader specified for the workflow (note that this reader takes precedence over readers set directly on the workflow if both are supplied)
scoring reader to use in the filter; if not supplied, only the checks possible with training data alone will be done
number of bins to use in estimating feature distributions
minimum non-null fraction of instances that a feature should contain
maximum absolute difference in fill rate between scoring and training data for a feature
maximum difference in fill ratio (symmetric) between scoring and training data for a feature
maximum Jensen-Shannon divergence between the training and scoring distributions for a feature
list of features that should never be removed (features that are used to create them will also be protected)
features that are protected from removal by JS divergence check
formula to compute the text features bin size. Input arguments are Summary and number of bins to use in computing feature distributions (histograms for numerics, hashes for strings). Output is the bins for the text features.
Time period used to apply the circular date transformation for date features; if not specified, the numeric feature transformation will be used
Minimum row threshold for scoring set comparisons to be used in checks. If the scoring set size is below this threshold, then only training data checks will be used
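A hedged usage sketch of the raw feature filter; prediction, trainReader, and scoreReader are assumed to exist and to read the same data type, and the threshold arguments listed above are left at their defaults here.
{{{
val workflowWithFilter = new OpWorkflow()
  .setResultFeatures(prediction)
  .withRawFeatureFilter(Option(trainReader), Option(scoreReader))
}}}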
:: Experimental :: Decides whether the cross-validation/train-validation-split will be done at workflow level. This will remove issues with data leakage, however it will impact the runtime.
this workflow, set to train part of the DAG within the cross-validation/train-validation split
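A hedged sketch; the no-argument withWorkflowCV call is my assumption for how this experimental switch is enabled.
{{{
val cvAtWorkflowLevel = workflow.withWorkflowCV
}}}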
Workflow for TransmogrifAI. Takes the final features that the user wants to generate as inputs and constructs the full DAG needed to generate them from those features' lineage. It then fits any estimators in the pipeline DAG to create a sequence of transformations that are saved in a workflow model.
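A hedged end-to-end sketch of the standard usage pattern this paragraph describes; prediction (a result feature) and trainDataReader are assumed to be defined elsewhere, and an implicit SparkSession is assumed to be in scope.
{{{
import com.salesforce.op.OpWorkflow

val workflow = new OpWorkflow()
  .setResultFeatures(prediction)
  .setReader(trainDataReader)

val model = workflow.train()   // fits every estimator in the DAG
val scores = model.score()     // applies the resulting transformers to the data
}}}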