# Source code for koho.sklearn._decision_forest

# encoding=utf-8
""" Decision Forest module.

- Classification
- n Decision Trees with soft voting
- Important Features

Python interface compatible with scikit-learn.
"""

# Author: AI Werkstatt (TM)
# (C) Copyright 2019, AI Werkstatt (TM) www.aiwerkstatt.com. All rights reserved.

# Scikit-learn compatible
# http://scikit-learn.org/stable/developers
# Trying to be consistent with scikit-learn's ensemble module
# https://github.com/scikit-learn/scikit-learn
# Basic concepts for the implementation of the classifier are based on
# G. Louppe, “Understanding Random Forests”, PhD Thesis, 2014

import numbers
import numpy as np
import operator
from functools import reduce
from warnings import warn
from sklearn.base import BaseEstimator, ClassifierMixin, MultiOutputMixin
from sklearn.utils.validation import check_X_y, check_array, check_is_fitted, check_consistent_length
from sklearn.utils.multiclass import unique_labels
from sklearn.utils._joblib import Parallel, delayed

from ._decision_tree import DecisionTreeClassifier
# Cython binding for C++ implementation
from ._decision_tree_cpp import RandomState

# ==============================================================================
# Decision Forest Classifier
# ==============================================================================


def _DecisionForestClassifier_bagging_fit_and_oob_score(estimator, oob_score, X, y, n_samples, n_outputs, n_classes, data_seed):
    """ Parallel processing helper function for DecisionForestClassifier's fit function.

    Fits ``estimator`` on a bootstrap sample of (X, y), redrawing until every
    output has all of its classes represented, and optionally computes the
    out-of-bag class probabilities.

    Returns
    -------
    (estimator, p) : tuple
        The fitted estimator and an array ``p`` of shape
        [n_samples, n_outputs, max(n_classes)] holding the predicted class
        probabilities for the out-of-bag samples (zeros for in-bag samples
        or when ``oob_score`` is False).
    """

    # Build a decision tree from the bootstrapped training data
    # drawing random samples with replacement
    random_state = RandomState(data_seed)
    idx = random_state.randint(0, n_samples, size=n_samples)  # includes 0, excludes n_samples
    # make sure training data includes all classes across all outputs
    n_redraws = 0
    while True:
        # check that every output still has all of its classes represented
        all_classes_present = True  # renamed: previous name shadowed the builtin all()
        for o in range(n_outputs):
            if np.unique(y[idx, o]).shape[0] < n_classes[o]:
                all_classes_present = False
                break
        if all_classes_present:
            break
        # redraw samples
        idx = random_state.randint(0, n_samples, size=n_samples)
        # unable to randomize training data while including all classes
        n_redraws += 1
        if n_redraws > 10000:
            raise ValueError("Unable to randomize training data including all classes for bagging.")

    estimator.fit(X[idx, :], y[idx])

    # Compute Out-Of-Bag estimates
    # as average error for all samples across all outputs when not included in bootstrap

    # We use n_classes_max to create a nice 3D array to hold the predicted values x samples x classes
    # as the number of classes can be different for different outputs
    n_classes_max = int(max(n_classes))
    p = np.zeros((n_samples, n_outputs, n_classes_max))

    if oob_score:
        # samples never drawn into the bootstrap are the out-of-bag set
        unsampled_idx = np.bincount(idx, minlength=n_samples) == 0
        if np.any(unsampled_idx):  # np.any short-circuits; cheaper than sum() on a boolean mask
            proba = estimator.predict_proba(X[unsampled_idx, :])
            p[unsampled_idx, :] = np.reshape(proba, (-1, n_outputs, n_classes_max))

    return estimator, p


class DecisionForestClassifier(BaseEstimator, ClassifierMixin, MultiOutputMixin):
    """ A decision forest classifier.

    Parameters
    ----------
    n_estimators : int, optional (default=100)
        The number of decision trees in the forest.

    bootstrap : boolean, optional (default=False)
        Whether bootstrap samples are used when building trees.
        Out-of-bag samples are used to estimate the generalization accuracy.

    oob_score : bool, optional (default=False)
        Whether to use out-of-bag samples to estimate the generalization accuracy.

    class_balance : str 'balanced' or None, optional (default='balanced')
        Weighting of the classes.

        - If 'balanced', then the values of y are used to automatically adjust class weights
          inversely proportional to class frequencies in the input data.
        - If None, all classes are supposed to have weight one.

    max_depth : int or None, optional (default=3)
        The maximum depth of the tree.

        - If None, the depth of the tree is expanded until all leaves
          are pure or no further impurity improvement can be achieved.

    max_features : int, float, str or None, optional (default=None)
        The number of random features to consider when looking for the best split at each node.

        - If int, then consider ``max_features`` features.
        - If float, then ``max_features`` is a percentage and
          int(``max_features`` * n_features) features are considered.
        - If 'auto', then ``max_features`` = sqrt(n_features).
        - If 'sqrt', then ``max_features`` = sqrt(n_features).
        - If 'log2', then ``max_features`` = log2(n_features).
        - If None, then ``max_features`` = n_features considering all features in random order.

        Note: the search for a split does not stop until at least one valid partition of the
        node samples is found up to the point that all features have been considered,
        even if it requires to effectively inspect more than ``max_features`` features.

        `Decision Tree`: ``max_features`` = None and ``max_thresholds`` = None

        `Random Tree`: ``max_features`` < n_features and ``max_thresholds`` = None

    max_thresholds : int 1 or None, optional (default=None)
        The number of random thresholds to consider when looking for the best split at each node.

        - If 1, then consider 1 random threshold, based on the `Extreme Randomized Tree`
          formulation.
        - If None, then all thresholds, based on the mid-point of the node samples,
          are considered.

        `Extreme Randomized Trees (ET)`: ``max_thresholds`` = 1

        `Totally Randomized Trees`: ``max_features`` = 1 and ``max_thresholds`` = 1,
        very similar to `Perfect Random Trees (PERT)`.

    missing_values : str 'NMAR' or None, optional (default=None)
        Handling of missing values, defined as np.NaN.

        - If 'NMAR' (Not Missing At Random), then during training: the split criterion
          considers missing values as another category and samples with missing values
          are passed to either the left or the right child depending on which option
          provides the best split, and then during testing: if the split criterion
          includes missing values, a missing value is dealt with accordingly (passed
          to left or right child), or if the split criterion does not include missing
          values, a missing value at a split criterion is dealt with by combining the
          results from both children proportionally to the number of samples that are
          passed to the children during training.
        - If None, an error is raised if one of the features has a missing value.
          An option is to use imputation (fill-in) of missing values prior to using
          the decision tree classifier.

    random_state : int or None, optional (default=None)
        A random state to control the pseudo number generation and repetitiveness of fit().

        - If int, random_state is the seed used by the random number generator;
        - If None, the random number generator is seeded with the current system time.

    n_jobs : int, optional (default=None)
        The number of jobs to run in parallel for both `fit` and `predict`.

        - None means 1.
        - If -1, then the number of jobs is set to the number of cores.

    Attributes
    ----------
    n_outputs_ : int
        The number of outputs (multi-output).

    classes_ : list of variable size arrays, shape = [n_classes for each output]
        The classes labels for each output.

    n_classes_ : list of int
        The number of classes for each output.

    n_features_ : int
        The number of features.

    estimators_ : list of tree objects from DecisionTreeClassifier
        The collection of the underlying sub-estimators.

    feature_importances_ : array, shape = [n_features]
        The feature importances. The higher, the more important the feature.
        The importance of a feature is computed as the (normalized)
        total reduction of the criterion brought by that feature.

    oob_score_ : float
        Score of the training dataset obtained using an out-of-bag estimate.
    """

    # We use 'class_balance' as the hyperparameter name instead of "class_weight"
    # The "class_weight" hyperparameter name is recognized by 'check_estimator()'
    # and the test "check_class_weight_classifiers()" is performed that uses the
    # dict parameter and requires for a decision tree the "min_weight_fraction_leaf"
    # hyperparameter to be implemented to pass the test.
[docs] def __init__(self, n_estimators=100, bootstrap=False, oob_score=False, class_balance='balanced', # pass through hyperparameter to decision trees max_depth=3, # pass through hyperparameter to decision trees max_features='auto', max_thresholds=None, missing_values=None, random_state=None, n_jobs=None): """ Create a new decision forest classifier and initialize it with hyperparameters. """ # Hyperparameters self.n_estimators = n_estimators self.bootstrap = bootstrap self.oob_score = oob_score self.class_balance = class_balance self.max_depth = max_depth self.max_features = max_features self.max_thresholds = max_thresholds self.missing_values = missing_values # Random Number Generator self.random_state = random_state # Parallel Processing self.n_jobs = n_jobs return
[docs] def fit(self, X, y): """ Build a decision forest classifier based on decision tree classifiers from the training data. Parameters ---------- X : array, shape = [n_samples, n_features] The training input samples. y : array, shape = [n_samples] or [n_samples, n_outputs] The target class labels corresponding to the training input samples. Returns ------- self : object Returns self. """ # Check and prepare data # ---------------------- # Check X, y if self.missing_values == 'NMAR': X, y = check_X_y(X, y, dtype=np.float64, order="C", force_all_finite='allow-nan', multi_output=True) else: X, y = check_X_y(X, y, dtype=np.float64, order="C", multi_output=True) n_samples, self.n_features_ = X.shape # Handle multi-outputs if y.ndim == 1: # 2D format for single-output and multi-output y = np.reshape(y, (-1, 1)) self.n_outputs_ = y.shape[1] if y.shape[0] != n_samples: raise ValueError("Mismatch: n_outputs, n_features and n_samples.") self.classes_ = [] # lists with an element for each output self.n_classes_ = np.zeros(self.n_outputs_, dtype=np.uint) y_int = np.zeros(y.shape, dtype=np.int) # make sure y is integer for o in range(self.n_outputs_): # process each output independently o_classes = unique_labels(y[:, o]) # Keep to raise required ValueError tested by 'check_estimator()' o_classes, y_int[:, o] = np.unique(y[:, o], return_inverse=True) # Encode y from classes to integers self.classes_.append(o_classes) self.n_classes_[o] = o_classes.shape[0] if self.n_outputs_ == 1: self.classes_ = reduce(operator.concat, self.classes_) # Check hyperparameters (here, not in __init__) # n estimators if not isinstance(self.n_estimators, (numbers.Integral, np.integer)): raise TypeError("n_estimators: must be an integer.") if self.n_estimators < 1: raise ValueError("n_estimators: %s < 1, " "but a decsion forest requires to have at least 1 decision tree." 
% self.n_estimators) # bootstrap if not isinstance(self.bootstrap, bool): raise TypeError("bootstrap: must be boolean.") # oob score if not isinstance(self.oob_score, bool): raise TypeError("oob_score: must be boolean.") if not self.bootstrap and self.oob_score: raise ValueError("oob_score: only available if bootstrap=True") # Random Number Generator random_state = RandomState(self.random_state) # Create explicitly different seeds for the decision trees # to avoid building the same tree over and over again for the entire decision forest # when decision trees are build in parallel. algo_seeds = random_state.randint(0, random_state.MAX_INT, size=self.n_estimators) # Build a decision forest # ----------------------- # Instantiate decision trees estimators = [] for e in range(self.n_estimators): estimator = DecisionTreeClassifier(class_balance=self.class_balance, max_depth=self.max_depth, max_features=self.max_features, max_thresholds=self.max_thresholds, missing_values=self.missing_values, random_state=algo_seeds[e]) estimators.append(estimator) # Build decision trees from the training data if not self.bootstrap: # embarrassing parallelism # for estimator in estimators: # estimator.fit(X, y) estimators = Parallel(n_jobs=self.n_jobs) \ (delayed(estimator.fit)(X, y) for estimator in estimators) else: # Bagging & Out_Of-Bag estimate # Different seeds for algorithm and data (bagging) # to avoid building the same trees multiple times # when the same seed comes up again. 
data_seeds = random_state.randint(0, random_state.MAX_INT, size=self.n_estimators) # embarrassing parallelism ps = [] out = Parallel(n_jobs=self.n_jobs) \ (delayed(_DecisionForestClassifier_bagging_fit_and_oob_score) # helper function (estimator, self.oob_score, X, y, n_samples, self.n_outputs_, self.n_classes_, data_seed) for estimator, data_seed in zip(estimators, data_seeds)) estimators, ps = zip(*out) # Predict classes probabilities for all outputs for the decision forest # as average of the class probabilities from all decision trees # >>> reduce if self.oob_score: ps = np.array(ps) class_probabilities = np.sum(ps, axis=0) # no normalization needed when using argmax( ) later on predictions = np.argmax(class_probabilities, axis=2) valid_idx = np.sum(np.sum(class_probabilities, axis=2), axis=1) > 0.0 # samples that have an oob score oob_n_samples = np.sum(valid_idx) if oob_n_samples == n_samples: self.oob_score_ = np.mean(y[valid_idx] == predictions[valid_idx]) else: self.oob_score_ = 0.0 warn("Only %s out of %s samples have an out-of-bag estimate. " "This probably means too few estimators were used " "to compute any reliable oob estimates." % (oob_n_samples, n_samples)) self.estimators_ = estimators # Return the classifier return self
[docs] def predict(self, X): """ Predict classes for the test data using soft voting. Parameters ---------- X : array, shape = [n_samples, n_features] The test input samples. Returns ------- y : array, shape = [n_samples] or [n_samples, n_outputs] The predicted classes for the test input samples. """ # Check that fit has been called check_is_fitted(self, ['estimators_']) # Check X if self.missing_values == 'NMAR': X = check_array(X, dtype=np.float64, order="C", force_all_finite='allow-nan') else: X = check_array(X, dtype=np.float64, order="C") n_samples = X.shape[0] n_classes_max = max(self.n_classes_) # Soft voting using # predicted class probabilities class_probablities = self.predict_proba(X) # Handle single-output and multi-outputs formatting # 2D format for single-output and multi-output class_probablities = np.reshape(class_probablities, (-1, self.n_outputs_, n_classes_max)) # Handle multi-outputs formatting y = [] if self.n_outputs_ == 1: # Determine class based on highest classes probabilities predictions = np.argmax(class_probablities[:, 0], axis=1) # Decode y back from integers to classes y = self.classes_.take(predictions, axis=0) else: for o in range(self.n_outputs_): # Determine class based on highest classes probabilities predictions = np.argmax(class_probablities[:, o], axis=1) # Decode y back from integers to classes y.append(self.classes_[o].take(predictions, axis=0)) y = np.array(y) y = np.reshape(y.transpose(), (-1, self.n_outputs_)) # 2D format for multi-output return y
[docs] def predict_proba(self, X): """ Predict classes probabilities for the test data. Parameters ---------- X : array, shape = [n_samples, n_features] The test input samples. Returns ------- p : array, shape = [n_samples, n_classes] The predicted classes probablities for the test input samples. """ # Check that fit has been called check_is_fitted(self, ['estimators_']) # Check X if self.missing_values == 'NMAR': X = check_array(X, dtype=np.float64, order="C", force_all_finite='allow-nan') else: X = check_array(X, dtype=np.float64, order="C") n_features = X.shape[1] if self.n_features_ != n_features: raise ValueError("X: number of features %s != number of features of the model %s, " "must match." % (n_features, self.n_features_)) # Predict class probabilities for all decision trees # embarrassing parallelism ps = [] # for estimator in self.estimators_: # p = estimator.predict_proba(X) # ps.append(p) ps = Parallel(n_jobs=self.n_jobs)\ (delayed(estimator.predict_proba)(X) for estimator in self.estimators_) # Predict classes probabilities for the decision forest # as average of the class probabilities from all decision trees proba = sum(ps) / len(self.estimators_) # reduce # Handle single-output and multi-outputs formatting n_classes_max = max(self.n_classes_) if self.n_outputs_ == 1: proba = np.reshape(proba, (-1, self.n_classes_[0])) else: proba = np.reshape(proba, (-1, self.n_outputs_, n_classes_max)) return proba
[docs] def score(self, X, y): """Returns the mean accuracy on the given test data and labels. sklearn has no metrics support for "multiclass-multioutput" format, therefore we implement our own score() here In multi-label classification, this is the subset accuracy which is a harsh metric since you require for each sample that each label set be correctly predicted. Parameters ---------- X : array-like, shape = (n_samples, n_features) Test samples. y : array-like, shape = (n_samples) or (n_samples, n_outputs) True labels for X. Returns ------- score : float Mean accuracy of self.predict(X) wrt. y. """ y_pred = self.predict(X) # Handle single-output and multi-outputs formatting y = y.ravel() y_pred = y_pred.ravel() # No metrics support "multiclass-multioutput" format # y_type, y, y_pred = _check_targets(y, y_pred) check_consistent_length(y, y_pred) score = np.average(y == y_pred) return score
@property def feature_importances_(self): """ Get feature importances from the decision forest. """ # Check that fit has been called check_is_fitted(self, ['estimators_']) # Calculate feature importances for all decision trees # embarrassing parallelism fis = [] # for estimator in self.estimators_: # fi = estimator.feature_importances_ # fis.append(fi) fis = Parallel(n_jobs=self.n_jobs) \ (delayed(getattr)(estimator, 'feature_importances_') for estimator in self.estimators_) # Calculate feature importances for the decision forest # as average of feature importances from all decision trees return sum(fis) / len(self.estimators_) # reduce