decision_tree

DecisionTreeClassifier ⚓︎

Decision Tree Classifier

A simple decision tree classifier with a user-specified impurity criterion, maximum depth, and minimum number of samples per leaf node.

Parameters:

Name                 Type           Description                                                       Default
criterion            str, optional  The impurity criterion to use when splitting nodes,              'gini'
                                    either 'gini' or 'entropy'
max_depth            int, optional  The maximum depth of the decision tree                           100
min_samples_in_leaf  int, optional  The minimum number of samples that need to be in a leaf          2
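
For orientation, a minimal usage sketch. The import path and the printed outputs are assumptions (the module lives in mlproject/decision_tree/_decision_tree.py, but the public import may differ); labels must be integer-encoded 0..k-1, because the classifier counts them with np.bincount internally.

import numpy as np
from mlproject.decision_tree import DecisionTreeClassifier  # assumed import path

# Tiny toy dataset; class labels are integers 0..k-1
X = np.array([[1.0, 2.0], [2.0, 1.0], [3.0, 4.0], [4.0, 3.0]])
y = np.array([0, 0, 1, 1])

clf = DecisionTreeClassifier(criterion="entropy", max_depth=5, min_samples_in_leaf=1)
clf.fit(X, y)

print(clf.predict(X))        # 1d array of predicted labels, e.g. [0 0 1 1]
print(clf.predict_proba(X))  # (n, k) array of leaf class probabilities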
Source code in mlproject/decision_tree/_decision_tree.py
class DecisionTreeClassifier:
    """Decision Tree Classifier

    Simple decision tree classifier with a user-specified impurity criterion,
    maximum depth, and minimum number of samples per leaf node.

    Parameters
    ----------
    criterion : str, optional
        The impurity criterion to use when splitting nodes, either 'gini' or 'entropy', by default 'gini'
    max_depth : int, optional
        The maximum depth of the decision tree, by default 100
    min_samples_in_leaf : int, optional
        The minimum number of samples that need to be in a leaf, by default 2
    """

    def __init__(self, criterion="gini", max_depth=100, min_samples_in_leaf=2):
        self.max_depth = max_depth
        self.min_samples_in_leaf = min_samples_in_leaf
        self.root = None

        if criterion.lower() == "gini":
            self.criterion = gini_impurity
        elif criterion.lower() == "entropy":
            self.criterion = entropy_impurity
        else:
            # fail fast instead of leaving self.criterion unset
            raise ValueError(f"Unknown criterion: {criterion!r}")

    def fit(self, X, y):
        """Fit the decision tree to the given data

        Parameters
        ----------
        X : 2d ndarray
            The data to be used for fitting the decision tree
        y : 2d ndarray
            An array of the true labels for the data points
        """

        # record the full set of classes so that _most_common_label can count
        # classes that are absent from a given leaf
        self.classes = np.unique(y)

        with progress as pb:
            t1 = pb.add_task("[blue]Training", total=1)

            self.root = self._grow(X, y)
            pb.update(t1, advance=1)
            if progress.finished:
                pb.update(t1, description="[bright_green]Training complete!")

    def predict(self, X):
        """Predict class labels for the given data.

        For each data point in the dataset, traverse the decision tree until a leaf
        node is reached and return the majority class of that leaf node.

        Parameters
        ----------
        X : 2d ndarray
            The data that we want to use to make predictions.

        Returns
        -------
        1d ndarray
            All predicted class labels with size n, where n is the number of data points.
        """
        return np.array(
            [self._traverse(datapoint, self.root, prob=False) for datapoint in X]
        )

    def predict_proba(self, X):
        """Predict class probabilities for the given data

        For each data point in the dataset, traverse the decision tree until a leaf
        node is reached and return the class probabilities of that leaf node.

        Parameters
        ----------
        X : 2d ndarray
            The data that we want to use to make predictions

        Returns
        -------
        2d ndarray
            All probabilities with size n x k, where n is the number of data points and k is the number of classes
        """
        return np.array(
            [self._traverse(datapoint, self.root, prob=True) for datapoint in X]
        )

    def _grow(self, X, y, cur_depth=0):
        """Grows a decision tree from the given data.
        This is the part that is doing the actual fitting of the decision tree.

        Parameters
        ----------
        X : 2d ndarray
            The data to use when growing the decision tree
        y : 2d ndarray
            Array of the true class labels
        cur_depth : int, optional
            The current depth of the decision tree, by default 0

        Returns
        -------
        Node
            A new node of class Node with new left and right children.
        """

        n_samples = X.shape[0]
        node_k = len(np.unique(y))  # number of distinct classes in this node

        # Stop growing when the maximum depth is reached, too few samples
        # remain, or the node is pure.
        if (
            cur_depth >= self.max_depth
            or n_samples < self.min_samples_in_leaf
            or node_k == 1
        ):
            most_common = self._most_common_label(y, prob=False)
            class_probs = self._most_common_label(y, prob=True)
            return Node(majority_class=most_common, class_probs=class_probs)

        cur_depth += 1

        best_feature, best_threshold = self._best_split(X, y)

        left_idxs = np.argwhere(X[:, best_feature] <= best_threshold).flatten()
        right_idxs = np.argwhere(X[:, best_feature] > best_threshold).flatten()

        left = self._grow(X[left_idxs, :], y[left_idxs], cur_depth)
        right = self._grow(X[right_idxs, :], y[right_idxs], cur_depth)

        return Node(left, right, best_feature, best_threshold)

    def _best_split(self, X, y):
        """Calculates the best split of a node with the given data points

        Parameters
        ----------
        X : 2d ndarray
            The data points to consider for splitting this node
        y : 2d ndarray
            The true labels to consider for splitting this node

        Returns
        -------
        tuple
            A tuple (feature index, threshold) describing the best split
        """
        best_gain = -np.inf
        split_idx, split_thresh = None, None

        for feat_idx in range(X.shape[1]):
            feature_col = X[:, feat_idx]
            possible_splits = np.unique(feature_col)

            for split in possible_splits:
                cur_gain = self._information_gain(y, feature_col, split)

                if cur_gain > best_gain:
                    best_gain = cur_gain
                    split_idx = feat_idx
                    split_thresh = split

        return split_idx, split_thresh

    def _information_gain(self, y, feature_col, split_thresh):
        """Calculates the information gain of a node with the given data labels

        Parameters
        ----------
        y : 2d ndarray
            array of true labels for this node
        feature_col : 2d ndarray
            Column of dataset containing the data points of the best feature for this split
        split_thresh : float or int
            the threshold for the best split of the data

        Returns
        -------
        float
            The information gain from this node compared to it's parent
        """

        parent_impurity = self.criterion(y)

        left_idxs = np.argwhere(feature_col <= split_thresh).flatten()
        right_idxs = np.argwhere(feature_col > split_thresh).flatten()

        if len(left_idxs) == 0 or len(right_idxs) == 0:
            return 0

        n = len(y)
        left_prob = len(left_idxs) / n
        right_prob = len(right_idxs) / n

        left_impurity = self.criterion(y[left_idxs])
        right_impurity = self.criterion(y[right_idxs])

        weighted_impurity = left_prob * left_impurity + right_prob * right_impurity

        information_gain = parent_impurity - weighted_impurity
        return information_gain

    def _traverse(self, X, node, prob=False):
        """Traverses the tree until it reaches a leaf node and returns either the majority
        class of that node or the class probabilities if prob is True.

        Parameters
        ----------
        X : 1d ndarray
            A single data point to route through the tree
        node : Node
            The node to start the traversal from.
        prob : bool, optional
            used to specify whether or not to return class probabilities, by default False

        Returns
        -------
        int or 1d ndarray
            The majority class if `prob` is False, otherwise the class probabilities
        """

        if node.is_leaf():
            if prob:
                return node.class_probs
            return node.majority_class

        if X[node.feature] <= node.threshold:
            return self._traverse(X, node.left, prob)
        return self._traverse(X, node.right, prob)

    def _most_common_label(self, y, prob=False):
        """Calculates the most common label of a leaf node or the class probabilities

        Parameters
        ----------
        y : 2d ndarray
            Array of true labels for this particular node
        prob : bool, optional
            used to specify whether or not to return class probabilities, by default False

        Returns
        -------
        int or 1d ndarray
            The majority class if `prob` is False, otherwise the class probabilities
        """
        # Flatten y because np.bincount() expects a 1d array of non-negative
        # integers; minlength gives every class a count, even classes absent
        # from this leaf. Labels are assumed to be integer-encoded 0..k-1.
        y = y.flatten()
        counts = np.bincount(y, minlength=len(self.classes))
        n = np.sum(counts)

        if prob:
            return counts / n

        return np.argmax(counts)
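
The gini_impurity and entropy_impurity helpers referenced above are imported from elsewhere in the package and are not shown in this excerpt. A minimal sketch of what they plausibly compute, based on the standard definitions (the actual implementations may differ):

import numpy as np

def gini_impurity(y):
    # Gini impurity: 1 - sum_k p_k^2, where p_k is the fraction of samples
    # in class k; 0 for a pure node, larger for more mixed nodes.
    _, counts = np.unique(y, return_counts=True)
    probs = counts / counts.sum()
    return 1.0 - np.sum(probs ** 2)

def entropy_impurity(y):
    # Shannon entropy: -sum_k p_k * log2(p_k); also 0 for a pure node.
    _, counts = np.unique(y, return_counts=True)
    probs = counts / counts.sum()
    return -np.sum(probs * np.log2(probs))

Whichever criterion is chosen, _information_gain scores a candidate split as the parent's impurity minus the sample-weighted average impurity of the two children, and _best_split keeps the feature/threshold pair that maximizes that difference.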

fit(X, y) ⚓︎

Fit the decision tree to the given data

Parameters:

Name  Type        Description                                         Default
X     2d ndarray  The data to be used for fitting the decision tree   required
y     2d ndarray  An array of the true labels for the data points     required
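
A short sketch of a fit call (the data is illustrative; labels are assumed to be integer-encoded, since _most_common_label counts them with np.bincount):

clf = DecisionTreeClassifier()
X = np.array([[0.5, 1.2], [1.5, 0.3], [3.1, 2.8]])
y = np.array([0, 0, 1])  # integer-encoded labels
clf.fit(X, y)            # grows the tree and stores its root in clf.root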
Source code in mlproject/decision_tree/_decision_tree.py
def fit(self, X, y):
    """Fit the decision tree to the given data

    Parameters
    ----------
    X : 2d ndarray
        The data to be used for fitting the decision tree
    y : 2d ndarray
        An array of the true labels for the data points
    """

    # record the full set of classes so that _most_common_label can count
    # classes that are absent from a given leaf
    self.classes = np.unique(y)

    with progress as pb:
        t1 = pb.add_task("[blue]Training", total=1)

        self.root = self._grow(X, y)
        pb.update(t1, advance=1)
        if progress.finished:
            pb.update(t1, description="[bright_green]Training complete!")

predict(X) ⚓︎

Predict class labels for the given data.

For each data point in the dataset, traverse the decision tree until a leaf node is reached and return the majority class of that leaf node.

Parameters:

Name  Type        Description                                        Default
X     2d ndarray  The data that we want to use to make predictions   required

Returns:

Type        Description
1d ndarray  All predicted class labels with size n, where n is the number of data points
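
For example, continuing the illustrative clf fitted above:

preds = clf.predict(np.array([[0.4, 1.0], [3.0, 3.0]]))
# preds is a 1d array with one label per row, e.g. array([0, 1]);
# each row is routed from the root to a leaf, whose majority class is returned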

Source code in mlproject/decision_tree/_decision_tree.py
def predict(self, X):
    """Predict class labels for the given data.

    For each data point in the dataset, traverse the decision tree until a leaf
    node is reached and return the majority class of that leaf node.

    Parameters
    ----------
    X : 2d ndarray
        The data that we want to use to make predictions.

    Returns
    -------
    1d ndarray
        All predicted class labels with size n, where n is the number of data points.
    """
    return np.array(
        [self._traverse(datapoint, self.root, prob=False) for datapoint in X]
    )

predict_proba(X) ⚓︎

Predict class probabilities for the given data

For each data point in the dataset, traverse the decision tree until a leaf node is reached and return the class probabilities of that leaf node.

Parameters:

Name  Type        Description                                        Default
X     2d ndarray  The data that we want to use to make predictions   required

Returns:

Type        Description
2d ndarray  All probabilities with size n x k, where n is the number of data points and k is the number of classes
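
And a matching sketch for predict_proba (same illustrative clf):

probs = clf.predict_proba(np.array([[0.4, 1.0], [3.0, 3.0]]))
# probs has shape (n, k); each row holds the class frequencies of the leaf
# the data point lands in, and sums to 1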

Source code in mlproject/decision_tree/_decision_tree.py
def predict_proba(self, X):
    """Predict class probabilities for the given data

    For each data point in the dataset, traverse the decision tree until a leaf
    node is reached and return the class probabilities of that leaf node.

    Parameters
    ----------
    X : 2d ndarray
        The data that we want to use to make predictions

    Returns
    -------
    2d ndarray
        All probabilities with size n x k, where n is the number of data points and k is the number of classes
    """
    return np.array(
        [self._traverse(datapoint, self.root, prob=True) for datapoint in X]
    )

Node ⚓︎

Node object for building a decision tree.

Parameters:

Name            Type                  Description                                                          Default
feature         int, optional         Index of the best feature for splitting this Node                    None
threshold       float, optional       The threshold for the best split of the data                         None
left            Node, optional        The left child of this Node, also of class Node                      None
right           Node, optional        The right child of this Node, also of class Node                     None
majority_class  int, optional         The majority class in this node, only set if this Node is a leaf     None
class_probs     1d ndarray, optional  Class probabilities for this node, only set if this Node is a leaf   None
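
A small sketch of how the two kinds of nodes are built (values are illustrative; _grow constructs them the same way):

# Leaf node: carries the prediction for data points that end up here
leaf = Node(majority_class=1, class_probs=np.array([0.25, 0.75]))
leaf.is_leaf()  # True, because majority_class is set

# Internal node: routes a data point left if X[feature] <= threshold, else right
other = Node(majority_class=0, class_probs=np.array([0.9, 0.1]))
internal = Node(left=leaf, right=other, feature=2, threshold=0.5)
internal.is_leaf()  # False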
Source code in mlproject/decision_tree/_node.py
class Node:
    """Node object for building a decision tree.

    Parameters
    ----------
    feature : int, optional
        Index of the best feature for splitting this Node, by default None
    threshold : float, optional
        The threshold for the best split of the data, by default None
    left : Node, optional
        The left child of this Node, also of class Node, by default None
    right : Node, optional
        The right child of this Node, also of class Node, by default None
    majority_class : int, optional
        The majority class in this node, only set if this Node is a leaf, by default None
    class_probs : 1d ndarray, optional
        Class probabilities for this node, only set if this Node is a leaf, by default None
    """

    def __init__(
        self,
        left=None,
        right=None,
        feature=None,
        threshold=None,
        *,
        majority_class=None,
        class_probs=None
    ):
        self.feature = feature
        self.threshold = threshold
        self.left, self.right = left, right
        self.majority_class = majority_class
        self.class_probs = class_probs

    def is_leaf(self):
        """Returns True if this Node is a leaf node, otherwise False

        Returns
        -------
        bool
            True if this Node is a leaf node, otherwise False
        """

        return self.majority_class is not None

is_leaf() ⚓︎

Returns True if this Node is a leaf node, otherwise False

Returns:

Type  Description
bool  True if this Node is a leaf node, otherwise False

Source code in mlproject/decision_tree/_node.py
def is_leaf(self):
    """Returns True if this Node is a leaf node, otherwise False

    Returns
    -------
    bool
        True if this Node is a leaf node, otherwise False
    """

    return self.majority_class is not None