Classification and Regression Trees

In [1]:
from sklearn.datasets import make_classification
from sklearn.tree import DecisionTreeClassifier
from sklearn import tree
from sklearn.model_selection import cross_val_score

import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

%matplotlib inline

Today: decision trees

Before that, let's briefly recall a few things from last time.

NumPy and vectorization

Most operations in NumPy work through so-called vectorization. This is easiest to illustrate with an example:

In [2]:
np.array([1, 2, 3, 4]) * 5
Out[2]:
array([ 5, 10, 15, 20])
In [3]:
np.array([1, 2, 3]) * np.array([3, 4, 5])
Out[3]:
array([ 3,  8, 15])
In [4]:
np.array([8, 3, 4, 1, 9, 4]) > 4
Out[4]:
array([ True, False, False, False,  True, False], dtype=bool)
In [5]:
(np.array([8, 3, 4, 1, 9, 4]) > 4).astype(float)
Out[5]:
array([ 1.,  0.,  0.,  0.,  1.,  0.])
In [6]:
np.array([10, 20, 30])[np.array([1, 0, 2, 0, 1])]
Out[6]:
array([20, 10, 30, 10, 20])

ΠžΡΠ½ΠΎΠ²Π½Π°Ρ‚Π° Ρ†Π΅Π» Π΅ Π΄Π° Π½Π΅ Π½ΠΈ сС Π½Π°Π»Π°Π³Π° Π΄Π° пишСм for Ρ†ΠΈΠΊΠ»ΠΈ. ΠšΠ°Ρ‚ΠΎ допълнСниС – Π²Π΅ΠΊΡ‚ΠΎΡ€ΠΈΠ·ΠΈΡ€Π°Π½ΠΈΡ‚Π΅ ΠΎΠΏΠ΅Ρ€Π°Ρ†ΠΈΠΈ Π² NumPy работят ΠΌΠ½ΠΎΠ³ΠΎ ΠΏΠΎ-Π±ΡŠΡ€Π·ΠΎ ΠΎΡ‚ Ρ†ΠΈΠΊΡŠΠ»Π° Π² Python, ΠΊΠΎΠΉΡ‚ΠΎ Π±ΠΈΡ…ΠΌΠ΅ написали.

LabelEncoder

If we have categorical data (strings, for example), we can use LabelEncoder to replace the categories with numbers:

In [7]:
from sklearn.preprocessing import LabelEncoder

encoder = LabelEncoder()
encoder.fit(["red", "green", "red", "blue", "red", "green"])

colors = ["green", "green", "blue"]

print("transofrmed:", encoder.transform(["green", "green", "blue"])) 
print("inverse:    ", encoder.inverse_transform([0, 1, 2]))
transformed: [1 1 0]
inverse:     ['blue' 'green' 'red']

OneHotEncoder

We can encode categories with a label encoder when there is some natural order among them (e.g. 4 is greater than 2). When there is no such order, however, it is usually best to use one-hot encoding: we create one feature per category, taking the value 0 or 1.

In [8]:
from sklearn.preprocessing import OneHotEncoder

encoder = OneHotEncoder()

encoder.fit([[0], 
             [1], 
             [0], 
             [2]])

print(encoder.transform([[0], [1], [1], [2], [0]]).toarray())
[[ 1.  0.  0.]
 [ 0.  1.  0.]
 [ 0.  1.  0.]
 [ 0.  0.  1.]
 [ 1.  0.  0.]]
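As a hedged aside: to one-hot encode string categories, the two encoders can be chained, as in the sketch below (recent scikit-learn versions can also one-hot encode strings directly, but the chained form matches the API used here):

from sklearn.preprocessing import LabelEncoder, OneHotEncoder

colors = ["red", "green", "red", "blue"]

# Strings -> integers, then integers -> one column of 0/1 per category.
integer_labels = LabelEncoder().fit_transform(colors)
one_hot = OneHotEncoder().fit_transform(integer_labels.reshape(-1, 1))
print(one_hot.toarray())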

But let's get back to today's topic!

Let's generate a synthetic dataset suitable for classification. You can check the documentation for the meaning of the parameters:

In [9]:
X, y = make_classification(n_samples=100,
                           n_features=2,
                           n_redundant=0, 
                           n_clusters_per_class=2, 
                           random_state=123)

X will contain a matrix of points (each row is a point with two coordinates), and y will contain the class of each point (0 or 1):

In [10]:
print(X[:4])
print(y[:4])
[[-0.01032243 -0.80566819]
 [-1.10293659  2.21661117]
 [-1.90795358 -0.20839902]
 [ 0.53115524  2.2762704 ]]
[1 0 0 1]

Let's plot the points and their classes in the plane:

In [11]:
plt.scatter(X[:, 0], X[:, 1], c=y);

Here is a function that plots the decision boundary of a decision tree (it colors the background according to how the classifier would label each point):

In [12]:
# Plotting decision regions adapted from 
# http://scikit-learn.org/stable/auto_examples/ensemble/plot_voting_decision_regions.html

def plot_boundary(clf, X, y):
    # Build a grid that covers the data with a one-unit margin around it.
    x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1
    y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1
    xx, yy = np.meshgrid(np.arange(x_min, x_max, 0.1),
                         np.arange(y_min, y_max, 0.1))

    f, ax = plt.subplots(figsize=(10, 8))

    # Classify every grid point and color the background by the predicted class.
    Z = clf.predict(np.c_[xx.ravel(), yy.ravel()])
    Z = Z.reshape(xx.shape)

    ax.contourf(xx, yy, Z, alpha=0.4)
    ax.scatter(X[:, 0], X[:, 1], c=y, s=20, edgecolor='k')

    plt.show()

Let's train a classifier:

In [13]:
classifier = DecisionTreeClassifier().fit(X, y)
print(classifier.score(X, y))
1.0

As you can see, it manages to find a solution that always classifies correctly. It is probably overfitting heavily. Let's plot the decision boundary:

In [14]:
plot_boundary(classifier, X, y)

The plot suggests overfitting: for example, the vertical strip around $x_0 = -1$ is classified entirely as yellow because a single point of that class sits there. That point is more likely an anomaly, and we would rather treat it as belonging to the other class.

We could regularize with min_samples_split:

In [15]:
classifier = DecisionTreeClassifier(min_samples_split=50).fit(X, y)
plot_boundary(classifier, X, y)

Here we get a simpler model that admittedly ignores some points, but at least it does not overfit.
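cross_val_score (imported at the top) can quantify the difference. A sketch; the exact scores depend on the data and the folds:

for params in [{}, {"min_samples_split": 50}]:
    scores = cross_val_score(DecisionTreeClassifier(**params), X, y, cv=5)
    print(params, "mean cross-validation accuracy:", scores.mean())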

Let's draw the tree the algorithm has found:

In [16]:
import graphviz

# To run this code you will need the graphviz library and the corresponding Python package.
#
# On macOS:
#
#  brew install graphviz
#  pip install graphviz
#
# On other operating systems, replace the first line with whatever you need (apt-get, yum, etc.)

def draw_decision_tree(classifier):
    # Render a plain version of the tree to a file on disk ("tree").
    dot_data = tree.export_graphviz(classifier, out_file=None)
    graph = graphviz.Source(dot_data)
    graph.render("tree")

    # Build a prettier, annotated version for inline display.
    dot_data = tree.export_graphviz(classifier, out_file=None,
                             feature_names=np.array(['X_0 Vertical', 'X_1 Horizontal']),
                             class_names=np.array(['Class_0', 'Class_1']),
                             filled=True, rounded=True,
                             special_characters=True)

    return graphviz.Source(dot_data)
In [17]:
draw_decision_tree(classifier)
Out[17]:
Node 0: X_0 Vertical ≤ -0.5159, gini = 0.4992, samples = 100, value = [48, 52], class = Class_1
├── True  → Node 1 (leaf): gini = 0.1327, samples = 42, value = [39, 3], class = Class_0
└── False → Node 2: X_1 Horizontal ≤ -1.6008, gini = 0.2622, samples = 58, value = [9, 49], class = Class_1
    ├── True  → Node 3 (leaf): gini = 0.375, samples = 12, value = [9, 3], class = Class_0
    └── False → Node 4 (leaf): gini = 0.0, samples = 46, value = [0, 46], class = Class_1

This is the tree the algorithm built with min_samples_split=50. There are a few things to notice.

First, the algorithm tries to find a tree of if statements with which to answer the classification question. Initially it compares $x_0$ (the first feature in X) with $-0.5159$. If it is smaller, it answers with the orange class. If it is larger, it compares $x_1$ with $-1.6008$ and makes a decision based on the result. This corresponds to the rectangles in the simplified algorithm.

In samples you can see how many points remain after each decision. For example, the root node ($x_0 \le -0.5159$) splits the 100 points into two groups, one with 42 and one with 58. In value you can see how many points there are from each class. For example, in the left branch under the root we end up in a situation where the train set has 39 orange and 3 blue points, and we answer orange (Class_0). The algorithm could keep splitting the tree, but since samples = 42 is less than min_samples_split = 50, it stops building.

gini is a measure that helps the algorithm judge which way of splitting the tree is best. We will look at it below.

Let's see how the algorithm does without regularization:

In [18]:
classifier = DecisionTreeClassifier().fit(X, y)

This is what the decision boundary looks like:

In [19]:
plot_boundary(classifier, X, y)

And this is what its tree looks like:

In [20]:
draw_decision_tree(classifier)
Out[20]:
Node 0: X_0 Vertical ≤ -0.5159, gini = 0.4992, samples = 100, value = [48, 52], class = Class_1
├── True → Node 1: X_0 Vertical ≤ -0.9725, gini = 0.1327, samples = 42, value = [39, 3], class = Class_0
│   ├── Node 2 (leaf): gini = 0.0, samples = 28, value = [28, 0], class = Class_0
│   └── Node 3: X_0 Vertical ≤ -0.9607, gini = 0.3367, samples = 14, value = [11, 3], class = Class_0
│       ├── Node 4 (leaf): gini = 0.0, samples = 1, value = [0, 1], class = Class_1
│       └── Node 5: X_1 Horizontal ≤ -0.9927, gini = 0.2604, samples = 13, value = [11, 2], class = Class_0
│           ├── Node 6 (leaf): gini = 0.0, samples = 6, value = [6, 0], class = Class_0
│           └── Node 7: X_0 Vertical ≤ -0.8159, gini = 0.4082, samples = 7, value = [5, 2], class = Class_0
│               ├── Node 8 (leaf): gini = 0.0, samples = 4, value = [4, 0], class = Class_0
│               └── Node 9: X_1 Horizontal ≤ 0.4175, gini = 0.4444, samples = 3, value = [1, 2], class = Class_1
│                   ├── Node 10 (leaf): gini = 0.0, samples = 2, value = [0, 2], class = Class_1
│                   └── Node 11 (leaf): gini = 0.0, samples = 1, value = [1, 0], class = Class_0
└── False → Node 12: X_1 Horizontal ≤ -1.6008, gini = 0.2622, samples = 58, value = [9, 49], class = Class_1
    ├── Node 13: X_0 Vertical ≤ 1.7032, gini = 0.375, samples = 12, value = [9, 3], class = Class_0
    │   ├── Node 14 (leaf): gini = 0.0, samples = 9, value = [9, 0], class = Class_0
    │   └── Node 15 (leaf): gini = 0.0, samples = 3, value = [0, 3], class = Class_1
    └── Node 16 (leaf): gini = 0.0, samples = 46, value = [0, 46], class = Class_1

You can see that the tree is considerably more complex, but in exchange it predicts the train set perfectly. If you trace the values, you can verify that it matches the plot above.

sklearn implements the CART algorithm.

Classification and Regression Trees

L. Breiman, J. Friedman, R. Olshen, and C. Stone. Classification and Regression Trees. Wadsworth, Belmont, CA, 1984

http://scikit-learn.org/stable/modules/tree.html#tree

The CART model is a binary tree.

  • Each node of the tree can have zero, one, or two children.
  • The algorithm works recursively.
  • Possible stopping criteria:
    • Absolute purity of the nodes is reached (only one class of data remains in the final children)
    • The maximum tree depth is reached (max_depth)
    • The minimum number of samples required for a split is reached (min_samples_split)

The algorithm is greedy:

  • It checks every possible column (feature) at every possible split point and picks the best one (a toy sketch of this search follows the list).
  • Training complexity: $O(n_{features}n_{samples}\log(n_{samples}))$
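Here is a toy sketch of that exhaustive search (not sklearn's optimized implementation), assuming the X and y generated above:

def gini(labels):
    # Gini impurity of a label array: sum over classes of p * (1 - p).
    _, counts = np.unique(labels, return_counts=True)
    proportions = counts / counts.sum()
    return np.sum(proportions * (1 - proportions))

def best_split(X, y):
    # Try every feature and every threshold between consecutive sorted values,
    # keeping the split with the lowest weighted child impurity
    # (equivalently, the highest information gain).
    best_feature, best_threshold, best_impurity = None, None, np.inf
    for feature in range(X.shape[1]):
        values = np.unique(X[:, feature])
        for threshold in (values[:-1] + values[1:]) / 2:
            left = y[X[:, feature] <= threshold]
            right = y[X[:, feature] > threshold]
            weighted = (len(left) * gini(left) + len(right) * gini(right)) / len(y)
            if weighted < best_impurity:
                best_feature, best_threshold, best_impurity = feature, threshold, weighted
    return best_feature, best_threshold

print(best_split(X, y))

For the dataset above this should recover a root split close to the one the library found ($x_0 \le -0.5159$).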

Next comes a bit of math that is not essential for understanding what is going on, but it paints the picture a little better:

Scoring function:

ΠžΡ†Π΅Π½ΡΠ²Π°Ρ‰Π°Ρ‚Π° функция Ρ€Π°Π±ΠΎΡ‚ΠΈ, Ρ‡Ρ€Π΅Π· Information Gain:

$$InformationGain = impurity(parent) - \sum_{children} \frac{N_{child}}{N_{parent}} \cdot impurity(child)$$

$impurity$ is a function that measures the "impurities" of a set. The algorithm can use different functions:

  • For classification
    1. Entropy
    2. Gini
    3. Misclassification
  • For regression
    1. Mean Squared Error
    2. Mean Absolute Error

When predicting a new record, the algorithm descends the built tree until it reaches a node with no children.

  • For regression, it predicts the mean value of the records remaining in the final node.
  • For classification, it picks the class represented by the majority of the records remaining in the final node.
  • Prediction complexity: $O(\log(n_{samples}))$

Functions for measuring impurity

1. Entropy

$entropy = \sum_i{-p_i \log_2(p_i)}$

$p_i = \frac{\text{size of } class_i}{\text{size of set}}$

Example 1:

We have the set:

['Лъчо', 'Лъчо', 'Стефан', 'Стефан']

Лъчо - 2, Стефан - 2

Proportions:

Лъчо: $\frac{2}{4}$, Стефан: $\frac{2}{4}$

Value for Лъчо: $$-\frac{2}{4} \cdot \log_2(\frac{2}{4})$$ $$-0.5 \cdot (-1)$$ $$0.5$$

Стефан has the same value, $0.5$.

The entropy of the set is $$entropy = 0.5 + 0.5 = 1.0$$

Example 2:

['Pears', 'Pears', 'Pears']

Pears - 3

Proportions:

Pears: $\frac{3}{3} = 1$

$$-1 \cdot \log_2(1)$$ $$-1 \cdot 0$$ $$entropy = 0$$

Here is a function that computes this:

In [21]:
def entropy(subset_counts):
    subset_counts = np.array(subset_counts)
    # Normalize the counts into proportions p_i.
    subset_counts_normalized = subset_counts / subset_counts.sum()

    # The tiny 1e-12 avoids taking log2(0) for empty classes.
    entropy = sum([-subset_count * np.log2(subset_count + 1e-12)
                   for subset_count in subset_counts_normalized])

    entropy = np.round(entropy, 4)
    print('Entropy for', subset_counts, 'is', entropy)
    return entropy

Note the + 1e-12 inside the logarithm. Think of it as a hack that saves us a special case by avoiding taking the logarithm of 0.

In [22]:
samples = [[2, 0], [1, 0], [9, 1], [4, 4]]

for sample in samples:
    entropy(sample)
Entropy for [2 0] is -0.0
Entropy for [1 0] is -0.0
Entropy for [9 1] is 0.469
Entropy for [4 4] is 1.0

With two classes the entropy lies in the interval $\big[0, 1\big]$. It is $1$ when the two classes are evenly distributed in the set and $0$ when the set contains elements of only one kind.

With three or more classes the function can exceed $1$: for $k$ equally likely classes its maximum is $\log_2 k$, so it has no fixed upper bound as the number of classes grows:

In [23]:
samples = [[2, 0, 1], [6, 0, 0], [9, 1, 0], [5, 5, 0], [5, 5, 5]]

for sample in samples:
    entropy(sample)
Entropy for [2 0 1] is 0.9183
Entropy for [6 0 0] is -0.0
Entropy for [9 1 0] is 0.469
Entropy for [5 5 0] is 1.0
Entropy for [5 5 5] is 1.585
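The $1.585$ above is exactly that ceiling for three balanced classes:

for k in [2, 3, 4]:
    print(k, 'classes, maximum entropy:', np.log2(k))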

2. Gini

$H(X_m) = \sum_k p_{mk} (1 - p_{mk})$

$p_{mk}$ is again the proportion of class $k$ in the set.

Example:

['Лъчо', 'Лъчо', 'Стефан', 'Стефан']

Лъчо - 2, Стефан - 2

Proportions:

Лъчо: $\frac{2}{4}$, Стефан: $\frac{2}{4}$

Value for Лъчо: $$\frac{2}{4} \cdot (1 - \frac{2}{4})$$ $$0.5 \cdot 0.5$$ $$0.25$$

Стефан has the same value, $0.25$.

$$gini = 0.25 + 0.25 = 0.5$$

Example 2:

['Pears', 'Pears', 'Pears']

Pears - 3

Proportions:

Pears: $\frac{3}{3} = 1$

$$1 \cdot (1 - 1)$$ $$1 \cdot 0$$ $$gini = 0$$

In [24]:
def gini_impurity(subset_counts):
    subset_counts = np.array(subset_counts)
    # Normalize the counts into proportions p_mk.
    subset_counts_normalized = subset_counts / subset_counts.sum()

    impurity = sum([subset_count * (1 - subset_count)
                    for subset_count in subset_counts_normalized])

    print('Gini impurity for', subset_counts, 'is', impurity)
    return impurity
In [25]:
samples = [[2, 0], [1, 0], [9, 1], [4, 4]]

for sample in samples:
    gini_impurity(sample)
Gini impurity for [2 0] is 0.0
Gini impurity for [1 0] is 0.0
Gini impurity for [9 1] is 0.18
Gini impurity for [4 4] is 0.5
In [26]:
samples = [[2, 0, 1], [6, 0, 0], [9, 1, 0], [5, 5, 0], [5, 5, 5]]

for sample in samples:
    gini_impurity(sample)
Gini impurity for [2 0 1] is 0.444444444444
Gini impurity for [6 0 0] is 0.0
Gini impurity for [9 1 0] is 0.18
Gini impurity for [5 5 0] is 0.5
Gini impurity for [5 5 5] is 0.666666666667

3. Misclassification

$H(X_m) = 1 - \max(p_{mk})$

In [27]:
def misclassification_impurity(subset_counts):
    subset_counts = np.array(subset_counts)
    subset_counts_normalized = subset_counts / subset_counts.sum()

    # 1 minus the proportion of the most common class.
    impurity = 1 - max(subset_counts_normalized)

    print('Misclassification impurity for', subset_counts, 'is', impurity)
    return impurity
In [28]:
samples = [[2, 0], [1, 0], [9, 1], [4, 4]]

for sample in samples:
    misclassification_impurity(sample)
Misclassification impurity for [2 0] is 0.0
Misclassification impurity for [1 0] is 0.0
Misclassification impurity for [9 1] is 0.1
Misclassification impurity for [4 4] is 0.5
In [29]:
samples = [[2, 0, 1], [6, 0, 0], [9, 1, 0], [5, 5, 0], [5, 5, 5]]

for sample in samples:
    misclassification_impurity(sample)
Misclassification impurity for [2 0 1] is 0.333333333333
Misclassification impurity for [6 0 0] is 0.0
Misclassification impurity for [9 1 0] is 0.1
Misclassification impurity for [5 5 0] is 0.5
Misclassification impurity for [5 5 5] is 0.666666666667

When constructing a tree, you can specify which function to use via the criterion parameter. scikit-learn supports only gini and entropy. You can find more information in the documentation:

http://scikit-learn.org/stable/modules/generated/sklearn.tree.DecisionTreeClassifier.html
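Switching the criterion is a single-parameter change; whether it alters this particular tree depends on the data. For example:

classifier = DecisionTreeClassifier(criterion='entropy', min_samples_split=50).fit(X, y)
print(classifier.score(X, y))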

At each step the algorithm tries to find an inequality that optimizes the information gain, computed by the following formula:

$$InformationGain = impurity(parent) - \sum_{children} \frac{N_{child}}{N_{parent}} \cdot impurity(child)$$

We can look at the different results we would get with the different impurity functions:

In [30]:
def information_gain(subsets, parent_impurity=1, f=entropy):
    total_count = sum(sum(i) for i in subsets)
    print("Total count:", total_count)

    # Impurity of the children, weighted by each subset's share of the samples.
    subsets_impurity = sum((sum(subset) / total_count * f(subset) for subset in subsets))
    gain = parent_impurity - subsets_impurity
    print("Information gain:", gain)
    return gain
In [31]:
subsets = [[2, 1], [1]]
for f in [entropy, gini_impurity, misclassification_impurity]:
    information_gain(subsets, parent_impurity=1, f=f); 
    print();
Total count: 4
Entropy for [2 1] is 0.9183
Entropy for [1] is -0.0
Information gain: 0.311275

Total count: 4
Gini impurity for [2 1] is 0.444444444444
Gini impurity for [1] is 0.0
Information gain: 0.666666666667

Total count: 4
Misclassification impurity for [2 1] is 0.333333333333
Misclassification impurity for [1] is 0.0
Information gain: 0.75

In [32]:
subsets = [[2], [2]]
for f in [entropy, gini_impurity, misclassification_impurity]:
    information_gain(subsets, parent_impurity=1, f=f); 
    print();
Total count: 4
Entropy for [2] is -0.0
Entropy for [2] is -0.0
Information gain: 1.0

Total count: 4
Gini impurity for [2] is 0.0
Gini impurity for [2] is 0.0
Information gain: 1.0

Total count: 4
Misclassification impurity for [2] is 0.0
Misclassification impurity for [2] is 0.0
Information gain: 1.0

In [33]:
gini_impurity([48, 52])
gini_impurity([9, 49])
gini_impurity([39, 3])
gini_impurity([0, 46]);
Gini impurity for [48 52] is 0.4992
Gini impurity for [ 9 49] is 0.262187871581
Gini impurity for [39  3] is 0.132653061224
Gini impurity for [ 0 46] is 0.0

That's it for the math. You don't need to remember all of it, but if you ever have to debug what the algorithm has found, it can help you explain the result to yourself.

Pros of decision trees

  • The tree is easy to interpret.
  • It deals easily with irrelevant features (gain = 0).
  • It can handle missing data (though not in the current sklearn implementation).
  • Compact representation of the model.
  • Fast at prediction (linear in the depth of the tree).
  • It can classify into more than two classes without extra tricks.
  • Easy to use, and gives good results with little experimentation.

Cons

  • Splits the attributes only along the axes.
  • Greedy: it may not find the best tree.
  • The number of possible trees grows exponentially.
  • Overfits heavily.

Random Forest

To deal with the problems of decision trees, we usually use something called a random forest (a hand-rolled sketch follows the list below). It works as follows:

  • It builds a large number of trees (10, 100, 1000, etc.).
  • Each tree is built with criteria different from the rest (a subset of the data, a subset of the features, and so on).
  • To answer a question, we ask all the trees and take the most popular answer.
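The idea can be sketched by hand with bootstrap samples and majority voting. This is only a toy version; the real RandomForestClassifier used below also subsamples features at every split:

rng = np.random.RandomState(0)

trees = []
for _ in range(10):
    # Bootstrap: draw rows with replacement and fit one deep tree per draw.
    indices = rng.randint(0, len(X), len(X))
    trees.append(DecisionTreeClassifier().fit(X[indices], y[indices]))

# Majority vote: average the 0/1 predictions of all the trees and round.
votes = np.mean([t.predict(X) for t in trees], axis=0)
predictions = (votes >= 0.5).astype(int)
print("ensemble train accuracy:", (predictions == y).mean())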

Here are some of the reasons this works better:

  • The individual trees will overfit, but each in a different way. Their averaged result should be better regularized.
  • If there are anomalies in the data, they take part in fewer trees, so fewer trees overfit because of them.
  • Varying which features are used explores different ways of answering the question, and while every tree will make mistakes, it is less likely that they all make the same ones.

In scikit-learn we use a random forest like this:

In [34]:
from sklearn.ensemble import RandomForestClassifier

classifier = RandomForestClassifier(random_state=23).fit(X, y) # without tuning the parameters
plot_boundary(classifier, X, y)

(we pass random_state=23 to get a deterministic result; if you change this parameter, a different forest will be found, which would make it hard to reproduce the same plots in this notebook)

Parameters for RF:

  • n_estimators – number of trees (10, 100, 1000)
  • criterion – for all the trees (gini, entropy)
  • max_features – how many features to try when searching for the best split. The default is sqrt(n_features), with a different subset drawn on each new search.
  • max_depth – maximum depth of the trees.
  • min_samples_split – the minimum number of samples needed to split a node
  • bootstrap – a second source of randomness: random sampling with replacement. It draws as many samples as the original set, but may draw one element several times (and never draw another)
  • n_jobs – trains several trees in parallel, at the cost of more memory
  • random_state – for reproducible experiments (see the example after this list)
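An illustrative combination of a few of these parameters with cross-validation (the specific values here are assumptions for the sketch, not recommendations):

forest = RandomForestClassifier(n_estimators=100,
                                criterion='entropy',
                                max_depth=4,
                                min_samples_split=10,
                                n_jobs=-1,
                                random_state=23)
print(cross_val_score(forest, X, y, cv=5).mean())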

And here are two arbitrary forests, without a fixed random_state:

In [35]:
classifier = RandomForestClassifier().fit(X, y)
plot_boundary(classifier, X, y)
In [36]:
classifier = RandomForestClassifier().fit(X, y)
plot_boundary(classifier, X, y)

Feature importance in random forest

Similar to the parameter weights in linear models, a random forest can tell you which features it finds most significant (i.e. which carry the most information).

You can see more here: http://scikit-learn.org/stable/auto_examples/ensemble/plot_forest_importances.html
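A minimal sketch with the toy dataset above:

forest = RandomForestClassifier(n_estimators=100, random_state=23).fit(X, y)

# One importance per feature; they sum to 1.
for name, importance in zip(['X_0 Vertical', 'X_1 Horizontal'], forest.feature_importances_):
    print(name, importance)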