org.apache.spark.ml.classification.RandomForestClassificationModel

All Implemented Interfaces: Serializable, org.apache.spark.internal.Logging, ClassifierParams, ProbabilisticClassifierParams, Params, HasCheckpointInterval, HasFeaturesCol, HasLabelCol, HasPredictionCol, HasProbabilityCol, HasRawPredictionCol, HasSeed, HasThresholds, HasWeightCol, PredictorParams, DecisionTreeParams, RandomForestClassifierParams, RandomForestParams, TreeClassifierParams, TreeEnsembleClassifierParams, TreeEnsembleModel<DecisionTreeClassificationModel>, TreeEnsembleParams, HasTrainingSummary<RandomForestClassificationTrainingSummary>, Identifiable, MLWritable

public class RandomForestClassificationModel extends ProbabilisticClassificationModel<Vector,RandomForestClassificationModel> implements RandomForestClassifierParams, TreeEnsembleModel<DecisionTreeClassificationModel>, MLWritable, Serializable, HasTrainingSummary<RandomForestClassificationTrainingSummary>

Random Forest model for classification. It supports both binary and multiclass labels, as well as both continuous and categorical features.

param: _trees Decision trees in the ensemble. Warning: These have null parents.

See Also:

Serialized Form

Nested Class Summary

Nested classes/interfaces inherited from interface org.apache.spark.internal.Logging
org.apache.spark.internal.Logging.LogStringContext, org.apache.spark.internal.Logging.SparkShellLoggingFilter
Method Summary

Modifier and Type

Method

Description

BinaryRandomForestClassificationTrainingSummary

binarySummary()

Gets summary of model on training set.

final BooleanParam

bootstrap()

Whether bootstrap samples are used when building trees.

final BooleanParam

cacheNodeIds()

If false, the algorithm will pass trees to executors to match instances with nodes.

final IntParam

checkpointInterval()

Param for setting the checkpoint interval (>= 1) or disabling checkpointing (-1).

RandomForestClassificationModel

copy(ParamMap extra)

Creates a copy of this instance with the same UID and some extra params.

long

estimatedSize()

RandomForestClassificationSummary

evaluate(Dataset<?> dataset)

Evaluates the model on a test dataset.

Vector

featureImportances()

final Param<String>

featureSubsetStrategy()

The number of features to consider for splits at each tree node.

final Param<String>

impurity()

Criterion used for information gain calculation (case-insensitive).

final Param<String>

leafCol()

Leaf indices column name.

static RandomForestClassificationModel

load(String path)

final IntParam

maxBins()

Maximum number of bins used for discretizing continuous features and for choosing how to split on features at each node.

final IntParam

maxDepth()

Maximum depth of the tree (nonnegative).

final IntParam

maxMemoryInMB()

Maximum memory in MB allocated to histogram aggregation.

final DoubleParam

minInfoGain()

Minimum information gain for a split to be considered at a tree node.

final IntParam

minInstancesPerNode()

Minimum number of instances each child must have after split.

final DoubleParam

minWeightFractionPerNode()

Minimum fraction of the weighted sample count that each child must have after split.

int

numClasses()

Number of classes (values which the label can take).

int

numFeatures()

Returns the number of features the model was trained on.

final IntParam

numTrees()

Number of trees to train (at least 1).

Vector

predictRaw(Vector features)

Raw prediction for each possible label.

static MLReader<RandomForestClassificationModel>

read()

final LongParam

seed()

Param for random seed.

final DoubleParam

subsamplingRate()

Fraction of the training data used for learning each decision tree, in range (0, 1].

RandomForestClassificationTrainingSummary

summary()

Gets summary of model on training set.

String

toString()

Summary of the model

int

totalNumNodes()

Total number of nodes, summed over all trees in the ensemble.

Dataset<Row>

transform(Dataset<?> dataset)

Transforms dataset by reading from PredictionModel.featuresCol(), and appending new columns as specified by parameters: - predicted labels as PredictionModel.predictionCol() of type Double - raw predictions (confidences) as ClassificationModel.rawPredictionCol() of type Vector - probability of each class as ProbabilisticClassificationModel.probabilityCol() of type Vector.

StructType

transformSchema(StructType schema)

Check transform validity and derive the output schema from the input schema.

DecisionTreeClassificationModel[]

trees()

Trees in this ensemble.

double[]

treeWeights()

Weights for each tree, zippable with TreeEnsembleModel.trees()

String

uid()

An immutable unique ID for the object and its derivatives.

final Param<String>

weightCol()

Param for weight column name.

MLWriter

write()

Returns an MLWriter instance for this ML instance.

Methods inherited from class org.apache.spark.ml.classification.ProbabilisticClassificationModel
normalizeToProbabilitiesInPlace, predictProbability, probabilityCol, setProbabilityCol, setThresholds, thresholds

Methods inherited from class org.apache.spark.ml.classification.ClassificationModel
predict, rawPredictionCol, setRawPredictionCol, transformImpl

Methods inherited from class org.apache.spark.ml.PredictionModel
featuresCol, labelCol, predictionCol, setFeaturesCol, setPredictionCol

Methods inherited from class org.apache.spark.ml.Model
hasParent, parent, setParent

Methods inherited from class org.apache.spark.ml.Transformer
transform, transform, transform

Methods inherited from class org.apache.spark.ml.PipelineStage
params

Methods inherited from class java.lang.Object
equals, getClass, hashCode, notify, notifyAll, wait, wait, wait

Methods inherited from interface org.apache.spark.ml.tree.DecisionTreeParams
getCacheNodeIds, getLeafCol, getMaxBins, getMaxDepth, getMaxMemoryInMB, getMinInfoGain, getMinInstancesPerNode, getMinWeightFractionPerNode, getOldStrategy, setLeafCol

Methods inherited from interface org.apache.spark.ml.param.shared.HasCheckpointInterval
getCheckpointInterval

Methods inherited from interface org.apache.spark.ml.param.shared.HasFeaturesCol
featuresCol, getFeaturesCol

Methods inherited from interface org.apache.spark.ml.param.shared.HasLabelCol
getLabelCol, labelCol

Methods inherited from interface org.apache.spark.ml.param.shared.HasPredictionCol
getPredictionCol, predictionCol

Methods inherited from interface org.apache.spark.ml.param.shared.HasProbabilityCol
getProbabilityCol, probabilityCol

Methods inherited from interface org.apache.spark.ml.param.shared.HasRawPredictionCol
getRawPredictionCol, rawPredictionCol

Methods inherited from interface org.apache.spark.ml.param.shared.HasSeed
getSeed

Methods inherited from interface org.apache.spark.ml.param.shared.HasThresholds
getThresholds, thresholds

Methods inherited from interface org.apache.spark.ml.util.HasTrainingSummary
hasSummary, setSummary

Methods inherited from interface org.apache.spark.ml.param.shared.HasWeightCol
getWeightCol

Methods inherited from interface org.apache.spark.internal.Logging
initializeForcefully, initializeLogIfNecessary, initializeLogIfNecessary, initializeLogIfNecessary$default$2, isTraceEnabled, log, logBasedOnLevel, logDebug, logDebug, logDebug, logDebug, logError, logError, logError, logError, logInfo, logInfo, logInfo, logInfo, logName, LogStringContext, logTrace, logTrace, logTrace, logTrace, logWarning, logWarning, logWarning, logWarning, MDC, org$apache$spark$internal$Logging$$log_, org$apache$spark$internal$Logging$$log__$eq, withLogContext

Methods inherited from interface org.apache.spark.ml.util.MLWritable
save

Methods inherited from interface org.apache.spark.ml.param.Params
clear, copyValues, defaultCopy, defaultParamMap, estimateMatadataSize, explainParam, explainParams, extractParamMap, extractParamMap, get, getDefault, getOrDefault, getParam, hasDefault, hasParam, isDefined, isSet, onParamChange, paramMap, params, set, set, set, setDefault, setDefault, shouldOwn

Methods inherited from interface org.apache.spark.ml.tree.RandomForestParams
getBootstrap, getNumTrees

Methods inherited from interface org.apache.spark.ml.tree.TreeClassifierParams
getImpurity, getOldImpurity

Methods inherited from interface org.apache.spark.ml.tree.TreeEnsembleClassifierParams
validateAndTransformSchema

Methods inherited from interface org.apache.spark.ml.tree.TreeEnsembleModel
getEstimatedSize, getLeafField, getTree, javaTreeWeights, predictLeaf, toDebugString

Methods inherited from interface org.apache.spark.ml.tree.TreeEnsembleParams
getFeatureSubsetStrategy, getOldStrategy, getSubsamplingRate

Method Details
- read
  
  public static MLReader<RandomForestClassificationModel> read()
- load
  
  public static RandomForestClassificationModel load(String path)
- totalNumNodes
  
  public int totalNumNodes()
  
  Description copied from interface: TreeEnsembleModel
  
  Total number of nodes, summed over all trees in the ensemble.
  
  Specified by:
  
  totalNumNodes in interface TreeEnsembleModel<DecisionTreeClassificationModel>
- impurity
  
  public final Param<String> impurity()
  
  Description copied from interface: TreeClassifierParams
  
  Criterion used for information gain calculation (case-insensitive). This impurity type is used in DecisionTreeClassifier and RandomForestClassifier. Supported: "entropy" and "gini". (default = gini)
  
  Specified by:
  
  impurity in interface TreeClassifierParams
  
  Returns:
  
  (undocumented)
- numTrees
  
  public final IntParam numTrees()
  
  Description copied from interface: RandomForestParams
  
  Number of trees to train (at least 1). If 1, then no bootstrapping is used. If greater than 1, then bootstrapping is done. TODO: Change to always do bootstrapping (simpler). SPARK-7130 (default = 20)
  Note: The reason that we cannot add this to both GBT and RF (i.e. in TreeEnsembleParams) is the param maxIter controls how many trees a GBT has. The semantics in the algorithms are a bit different.
  
  Specified by:
  
  numTrees in interface RandomForestParams
  
  Returns:
  
  (undocumented)
- bootstrap
  
  public final BooleanParam bootstrap()
  
  Description copied from interface: RandomForestParams
  
  Whether bootstrap samples are used when building trees.
  
  Specified by:
  
  bootstrap in interface RandomForestParams
  
  Returns:
  
  (undocumented)
- subsamplingRate
  
  public final DoubleParam subsamplingRate()
  
  Description copied from interface: TreeEnsembleParams
  
  Fraction of the training data used for learning each decision tree, in range (0, 1]. (default = 1.0)
  
  Specified by:
  
  subsamplingRate in interface TreeEnsembleParams
  
  Returns:
  
  (undocumented)
- featureSubsetStrategy
  
  public final Param<String> featureSubsetStrategy()
  
  Description copied from interface: TreeEnsembleParams
  
  The number of features to consider for splits at each tree node. Supported options: - "auto": Choose automatically for task: If numTrees == 1, set to "all". If numTrees is greater than 1 (forest), set to "sqrt" for classification and to "onethird" for regression. - "all": use all features - "onethird": use 1/3 of the features - "sqrt": use sqrt(number of features) - "log2": use log2(number of features) - "n": when n is in the range (0, 1.0], use n * number of features. When n is in the range (1, number of features), use n features. (default = "auto")
  These various settings are based on the following references: - log2: tested in Breiman (2001) - sqrt: recommended by Breiman manual for random forests - The defaults of sqrt (classification) and onethird (regression) match the R randomForest package.
  Specified by:
  
  featureSubsetStrategy in interface TreeEnsembleParams
  
  Returns:
  
  (undocumented)
  
  See Also:
  
  Breiman (2001)
  
  Breiman manual for random forests
- leafCol
  
  public final Param<String> leafCol()
  
  Description copied from interface: DecisionTreeParams
  
  Leaf indices column name. Predicted leaf index of each instance in each tree by preorder. (default = "")
  
  Specified by:
  
  leafCol in interface DecisionTreeParams
  
  Returns:
  
  (undocumented)
- maxDepth
  
  public final IntParam maxDepth()
  
  Description copied from interface: DecisionTreeParams
  
  Maximum depth of the tree (nonnegative). E.g., depth 0 means 1 leaf node; depth 1 means 1 internal node + 2 leaf nodes. (default = 5)
  
  Specified by:
  
  maxDepth in interface DecisionTreeParams
  
  Returns:
  
  (undocumented)
- maxBins
  
  public final IntParam maxBins()
  
  Description copied from interface: DecisionTreeParams
  
  Maximum number of bins used for discretizing continuous features and for choosing how to split on features at each node. More bins give higher granularity. Must be at least 2 and at least number of categories in any categorical feature. (default = 32)
  
  Specified by:
  
  maxBins in interface DecisionTreeParams
  
  Returns:
  
  (undocumented)
- minInstancesPerNode
  
  public final IntParam minInstancesPerNode()
  
  Description copied from interface: DecisionTreeParams
  
  Minimum number of instances each child must have after split. If a split causes the left or right child to have fewer than minInstancesPerNode, the split will be discarded as invalid. Must be at least 1. (default = 1)
  
  Specified by:
  
  minInstancesPerNode in interface DecisionTreeParams
  
  Returns:
  
  (undocumented)
- minWeightFractionPerNode
  
  public final DoubleParam minWeightFractionPerNode()
  
  Description copied from interface: DecisionTreeParams
  
  Minimum fraction of the weighted sample count that each child must have after split. If a split causes the fraction of the total weight in the left or right child to be less than minWeightFractionPerNode, the split will be discarded as invalid. Should be in the interval [0.0, 0.5). (default = 0.0)
  
  Specified by:
  
  minWeightFractionPerNode in interface DecisionTreeParams
  
  Returns:
  
  (undocumented)
- minInfoGain
  
  public final DoubleParam minInfoGain()
  
  Description copied from interface: DecisionTreeParams
  
  Minimum information gain for a split to be considered at a tree node. Should be at least 0.0. (default = 0.0)
  
  Specified by:
  
  minInfoGain in interface DecisionTreeParams
  
  Returns:
  
  (undocumented)
- maxMemoryInMB
  
  public final IntParam maxMemoryInMB()
  
  Description copied from interface: DecisionTreeParams
  
  Maximum memory in MB allocated to histogram aggregation. If too small, then 1 node will be split per iteration, and its aggregates may exceed this size. (default = 256 MB)
  
  Specified by:
  
  maxMemoryInMB in interface DecisionTreeParams
  
  Returns:
  
  (undocumented)
- cacheNodeIds
  
  public final BooleanParam cacheNodeIds()
  
  Description copied from interface: DecisionTreeParams
  
  If false, the algorithm will pass trees to executors to match instances with nodes. If true, the algorithm will cache node IDs for each instance. Caching can speed up training of deeper trees. Users can set how often the cache should be checkpointed, or disable checkpointing, by setting checkpointInterval. (default = false)
  
  Specified by:
  
  cacheNodeIds in interface DecisionTreeParams
  
  Returns:
  
  (undocumented)
- weightCol
  
  public final Param<String> weightCol()
  
  Description copied from interface: HasWeightCol
  
  Param for weight column name. If this is not set or empty, we treat all instance weights as 1.0.
  
  Specified by:
  
  weightCol in interface HasWeightCol
  
  Returns:
  
  (undocumented)
- seed
  
  public final LongParam seed()
  
  Description copied from interface: HasSeed
  
  Param for random seed.
  
  Specified by:
  
  seed in interface HasSeed
  
  Returns:
  
  (undocumented)
- checkpointInterval
  
  public final IntParam checkpointInterval()
  
  Description copied from interface: HasCheckpointInterval
  
  Param for setting the checkpoint interval (>= 1) or disabling checkpointing (-1). E.g. 10 means that the cache will get checkpointed every 10 iterations. Note: this setting will be ignored if the checkpoint directory is not set in the SparkContext.
  
  Specified by:
  
  checkpointInterval in interface HasCheckpointInterval
  
  Returns:
  
  (undocumented)
- uid
  
  public String uid()
  
  Description copied from interface: Identifiable
  
  An immutable unique ID for the object and its derivatives.
  
  Specified by:
  
  uid in interface Identifiable
  
  Returns:
  
  (undocumented)
- numFeatures
  
  public int numFeatures()
  
  Description copied from class: PredictionModel
  
  Returns the number of features the model was trained on. If unknown, returns -1.
  
  Overrides:
  
  numFeatures in class PredictionModel<Vector,RandomForestClassificationModel>
- numClasses
  
  public int numClasses()
  
  Description copied from class: ClassificationModel
  
  Number of classes (values which the label can take).
  
  Specified by:
  
  numClasses in class ClassificationModel<Vector,RandomForestClassificationModel>
- estimatedSize
  
  public long estimatedSize()
- trees
  
  public DecisionTreeClassificationModel[] trees()
  
  Description copied from interface: TreeEnsembleModel
  
  Trees in this ensemble. Warning: These have null parent Estimators.
  
  Specified by:
  
  trees in interface TreeEnsembleModel<DecisionTreeClassificationModel>
- treeWeights
  
  public double[] treeWeights()
  
  Description copied from interface: TreeEnsembleModel
  
  Weights for each tree, zippable with TreeEnsembleModel.trees().
  
  Specified by:
  
  treeWeights in interface TreeEnsembleModel<DecisionTreeClassificationModel>
- summary
  
  public RandomForestClassificationTrainingSummary summary()
  
  Gets summary of model on training set. An exception is thrown if hasSummary is false.
  
  Specified by:
  
  summary in interface HasTrainingSummary<RandomForestClassificationTrainingSummary>
  
  Returns:
  
  (undocumented)
- binarySummary
  
  public BinaryRandomForestClassificationTrainingSummary binarySummary()
  
  Gets summary of model on training set. An exception is thrown if hasSummary is false or it is a multiclass model.
  
  Returns:
  
  (undocumented)
- evaluate
  
  public RandomForestClassificationSummary evaluate(Dataset<?> dataset)
  
  Evaluates the model on a test dataset.
  
  Parameters:
  
  dataset - Test dataset to evaluate model on.
  
  Returns:
  
  (undocumented)
- transformSchema
  
  public StructType transformSchema(StructType schema)
  
  Description copied from class: PipelineStage
  
  Check transform validity and derive the output schema from the input schema.
  We check validity for interactions between parameters during transformSchema and raise an exception if any parameter value is invalid. Parameter value checks which do not depend on other parameters are handled by Param.validate().
  Typical implementation should first conduct verification on schema change and parameter validity, including complex parameter interaction checks.
  
  Overrides:
  
  transformSchema in class ProbabilisticClassificationModel<Vector,RandomForestClassificationModel>
  
  Parameters:
  
  schema - (undocumented)
  
  Returns:
  
  (undocumented)
- transform
  
  public Dataset<Row> transform(Dataset<?> dataset)
  
  Description copied from class: ProbabilisticClassificationModel
  
  Transforms dataset by reading from PredictionModel.featuresCol(), and appending new columns as specified by parameters: - predicted labels as PredictionModel.predictionCol() of type Double - raw predictions (confidences) as ClassificationModel.rawPredictionCol() of type Vector - probability of each class as ProbabilisticClassificationModel.probabilityCol() of type Vector.
  
  Overrides:
  
  transform in class ProbabilisticClassificationModel<Vector,RandomForestClassificationModel>
  
  Parameters:
  
  dataset - input dataset
  
  Returns:
  
  transformed dataset
- predictRaw
  
  public Vector predictRaw(Vector features)
  
  Description copied from class: ClassificationModel
  
  Raw prediction for each possible label. The meaning of a "raw" prediction may vary between algorithms, but it intuitively gives a measure of confidence in each possible label (where larger = more confident). This internal method is used to implement transform() and output ClassificationModel.rawPredictionCol().
  
  Specified by:
  
  predictRaw in class ClassificationModel<Vector,RandomForestClassificationModel>
  
  Parameters:
  
  features - (undocumented)
  
  Returns:
  
  vector where element i is the raw prediction for label i. This raw prediction may be any real number, where a larger value indicates greater confidence for that label.
- copy
  
  public RandomForestClassificationModel copy(ParamMap extra)
  
  Description copied from interface: Params
  
  Creates a copy of this instance with the same UID and some extra params. Subclasses should implement this method and set the return type properly. See defaultCopy().
  
  Specified by:
  
  copy in interface Params
  
  Specified by:
  
  copy in class Model<RandomForestClassificationModel>
  
  Parameters:
  
  extra - (undocumented)
  
  Returns:
  
  (undocumented)
- toString
  
  public String toString()
  
  Description copied from interface: TreeEnsembleModel
  
  Summary of the model
  
  Specified by:
  
  toString in interface Identifiable
  
  Specified by:
  
  toString in interface TreeEnsembleModel<DecisionTreeClassificationModel>
  
  Overrides:
  
  toString in class Object
- featureImportances
  
  public Vector featureImportances()
- write
  
  public MLWriter write()
  
  Description copied from interface: MLWritable
  
  Returns an MLWriter instance for this ML instance.
  
  Specified by:
  
  write in interface MLWritable
  
  Returns:
  
  (undocumented)

Class RandomForestClassificationModel

Nested Class Summary

Nested classes/interfaces inherited from interface org.apache.spark.internal.Logging

Method Summary

Methods inherited from class org.apache.spark.ml.classification.ProbabilisticClassificationModel

Methods inherited from class org.apache.spark.ml.classification.ClassificationModel

Methods inherited from class org.apache.spark.ml.PredictionModel

Methods inherited from class org.apache.spark.ml.Model

Methods inherited from class org.apache.spark.ml.Transformer

Methods inherited from class org.apache.spark.ml.PipelineStage

Methods inherited from class java.lang.Object

Methods inherited from interface org.apache.spark.ml.tree.DecisionTreeParams

Methods inherited from interface org.apache.spark.ml.param.shared.HasCheckpointInterval

Methods inherited from interface org.apache.spark.ml.param.shared.HasFeaturesCol

Methods inherited from interface org.apache.spark.ml.param.shared.HasLabelCol

Methods inherited from interface org.apache.spark.ml.param.shared.HasPredictionCol

Methods inherited from interface org.apache.spark.ml.param.shared.HasProbabilityCol

Methods inherited from interface org.apache.spark.ml.param.shared.HasRawPredictionCol

Methods inherited from interface org.apache.spark.ml.param.shared.HasSeed

Methods inherited from interface org.apache.spark.ml.param.shared.HasThresholds

Methods inherited from interface org.apache.spark.ml.util.HasTrainingSummary

Methods inherited from interface org.apache.spark.ml.param.shared.HasWeightCol

Methods inherited from interface org.apache.spark.internal.Logging

Methods inherited from interface org.apache.spark.ml.util.MLWritable

Methods inherited from interface org.apache.spark.ml.param.Params

Methods inherited from interface org.apache.spark.ml.tree.RandomForestParams

Methods inherited from interface org.apache.spark.ml.tree.TreeClassifierParams

Methods inherited from interface org.apache.spark.ml.tree.TreeEnsembleClassifierParams

Methods inherited from interface org.apache.spark.ml.tree.TreeEnsembleModel

Methods inherited from interface org.apache.spark.ml.tree.TreeEnsembleParams

Method Details

read

load

totalNumNodes

impurity

numTrees

bootstrap

subsamplingRate

featureSubsetStrategy

leafCol

maxDepth

maxBins

minInstancesPerNode

minWeightFractionPerNode

minInfoGain

maxMemoryInMB

cacheNodeIds

weightCol

seed

checkpointInterval

uid

numFeatures

numClasses

estimatedSize

trees

treeWeights

summary

binarySummary

evaluate

transformSchema

transform

predictRaw

copy

toString

featureImportances

write