Study/Machine Learning
Multilabel classification
Kisung Moon
2021. 5. 5. 10:06
In [63]:
import pandas as pd
import numpy as np
import time
# Load Data Viz Pkgs
import matplotlib.pyplot as plt
import seaborn as sns
# ML Pkgs
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB,MultinomialNB
### Split Dataset into Train and Test
from sklearn.model_selection import train_test_split
# Multi Label Pkgs
from skmultilearn.problem_transform import BinaryRelevance
from skmultilearn.problem_transform import ClassifierChain
from skmultilearn.problem_transform import LabelPowerset
from skmultilearn.adapt import MLkNN
# confusion_matrix
from sklearn.metrics import multilabel_confusion_matrix, classification_report, accuracy_score, hamming_loss, f1_score, roc_curve, roc_auc_score
# Cross Validation
from sklearn.model_selection import cross_val_predict, GridSearchCV, validation_curve
# Ensemble
from sklearn.ensemble import RandomForestClassifier
In [2]:
df_features = pd.read_csv("Input/Robust_S7_Data.csv")
df_targets = pd.read_csv("Input/Robust_S6_Data.csv")
1. EDA¶
1.1 Feature¶
- 3574 rows
- 41774 columns
- Genetic profile of each M. tuberculosis sample
- A value of 1 marks a mutation at that genomic position
In [3]:
df_features.head()
Out[3]:
| id | 31 | 64 | 238 | 266 | 326 | 328 | 351 | 365 | 526 | ... | 4409708 | 4409819 | 4409959 | 4409973 | 4410076 | 4410297 | 4410512 | 4410590 | 4410666 | 4410782
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | ERR2512419 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
1 | ERR2512432 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
2 | ERR2512444 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
3 | SRR6046861 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
4 | ERR2512448 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
5 rows × 41774 columns
In [4]:
df_features.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3574 entries, 0 to 3573
Columns: 41774 entries, id to 4410782
dtypes: float64(41773), object(1)
memory usage: 1.1+ GB
In [5]:
df_features.describe()
Out[5]:
| 31 | 64 | 238 | 266 | 326 | 328 | 351 | 365 | 526 | 573 | ... | 4409708 | 4409819 | 4409959 | 4409973 | 4410076 | 4410297 | 4410512 | 4410590 | 4410666 | 4410782
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
count | 3569.000000 | 3572.000000 | 3571.000000 | 3572.000000 | 3571.000000 | 3571.000000 | 3572.000000 | 3572.000000 | 3573.00000 | 3571.000000 | ... | 3574.000000 | 3573.00000 | 3574.000000 | 3574.000000 | 3574.000000 | 3574.000000 | 3573.00000 | 3565.000000 | 3573.00000 | 3573.00000 |
mean | 0.000841 | 0.000560 | 0.001400 | 0.000280 | 0.000280 | 0.000280 | 0.000280 | 0.000560 | 0.00028 | 0.000280 | ... | 0.000280 | 0.00028 | 0.000280 | 0.000280 | 0.000280 | 0.000280 | 0.00028 | 0.000281 | 0.00028 | 0.00028 |
std | 0.028984 | 0.023659 | 0.037398 | 0.016732 | 0.016734 | 0.016734 | 0.016732 | 0.023659 | 0.01673 | 0.016734 | ... | 0.016727 | 0.01673 | 0.016727 | 0.016727 | 0.016727 | 0.016727 | 0.01673 | 0.016748 | 0.01673 | 0.01673 |
min | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.00000 | 0.000000 | ... | 0.000000 | 0.00000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.00000 | 0.000000 | 0.00000 | 0.00000 |
25% | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.00000 | 0.000000 | ... | 0.000000 | 0.00000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.00000 | 0.000000 | 0.00000 | 0.00000 |
50% | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.00000 | 0.000000 | ... | 0.000000 | 0.00000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.00000 | 0.000000 | 0.00000 | 0.00000 |
75% | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.00000 | 0.000000 | ... | 0.000000 | 0.00000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.00000 | 0.000000 | 0.00000 | 0.00000 |
max | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.00000 | 1.000000 | ... | 1.000000 | 1.00000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.00000 | 1.000000 | 1.00000 | 1.00000 |
8 rows × 41773 columns
1.2 Target¶
- Same 3574 rows as the feature table, one per sample
- Drug-resistance phenotype of each M. tuberculosis sample
- 1 means resistant
- A sample can be resistant to several drugs at once, hence the multilabel setup
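For example, row 0 below is resistant to ethambutol, isoniazid, pyrazinamide, and rifampicin but has no streptomycin result, so after preprocessing its label vector becomes [1.0, 1.0, 1.0, 1.0, 0.0].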
In [6]:
df_targets.head()
Out[6]:
| id | publication | project | date | country | lineage | sublineage | ethambutol | isoniazid | pyrazinamide | rifampicin | streptomycin
---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | ERR2512419 | cryptic_nejm_2018 | cryptic_nejm_2018 | NaN | United Kingdom | lineage2 | lineage2.2.1 | 1.0 | 1.0 | 1.0 | 1.0 | NaN |
1 | ERR2512432 | cryptic_nejm_2018 | cryptic_nejm_2018 | NaN | United Kingdom | lineage2 | lineage2.2.1 | 0.0 | 0.0 | 0.0 | 0.0 | NaN |
2 | ERR2512444 | cryptic_nejm_2018 | cryptic_nejm_2018 | NaN | United Kingdom | lineage2 | lineage2.2.1.1 | 0.0 | 0.0 | 0.0 | 0.0 | NaN |
3 | SRR6046861 | cryptic_nejm_2018 | cryptic_nejm_2018 | NaN | United Kingdom | lineage2 | lineage2.2.1 | 0.0 | 0.0 | 0.0 | 0.0 | NaN |
4 | ERR2512448 | cryptic_nejm_2018 | cryptic_nejm_2018 | NaN | United Kingdom | lineage2 | lineage2.2.1 | 0.0 | 0.0 | 0.0 | 0.0 | NaN |
In [7]:
df_targets.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3574 entries, 0 to 3573
Data columns (total 12 columns):
 #   Column        Non-Null Count  Dtype
---  ------        --------------  -----
 0   id            3574 non-null   object
 1   publication   3574 non-null   object
 2   project       3574 non-null   object
 3   date          1094 non-null   float64
 4   country       3144 non-null   object
 5   lineage       3574 non-null   object
 6   sublineage    3574 non-null   object
 7   ethambutol    3352 non-null   float64
 8   isoniazid     3501 non-null   float64
 9   pyrazinamide  2212 non-null   float64
 10  rifampicin    3547 non-null   float64
 11  streptomycin  1489 non-null   float64
dtypes: float64(6), object(6)
memory usage: 335.2+ KB
In [8]:
df_targets.describe()
Out[8]:
| date | ethambutol | isoniazid | pyrazinamide | rifampicin | streptomycin
---|---|---|---|---|---|---|
count | 1094.000000 | 3352.000000 | 3501.000000 | 2212.000000 | 3547.000000 | 1489.000000 |
mean | 2009.158135 | 0.332637 | 0.554699 | 0.313743 | 0.527770 | 0.442579 |
std | 3.008017 | 0.471228 | 0.497070 | 0.464118 | 0.499299 | 0.496859 |
min | 1995.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 |
25% | 2008.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 |
50% | 2009.000000 | 0.000000 | 1.000000 | 0.000000 | 1.000000 | 0.000000 |
75% | 2010.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 |
max | 2016.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 |
In [9]:
df_targets['ethambutol'].value_counts().plot(kind='bar')
Out[9]:
<AxesSubplot:>
In [10]:
df_targets['isoniazid'].value_counts().plot(kind='bar')
Out[10]:
<AxesSubplot:>
In [11]:
df_targets['pyrazinamide'].value_counts().plot(kind='bar')
Out[11]:
<AxesSubplot:>
In [12]:
df_targets['rifampicin'].value_counts().plot(kind='bar')
Out[12]:
<AxesSubplot:>
In [13]:
df_targets['streptomycin'].value_counts().plot(kind='bar')
Out[13]:
<AxesSubplot:>
2. Data Preprocessing¶
2.1 Check that the Feature and Target ids match => they do!¶
In [14]:
check = pd.Series(df_features['id'] == df_targets['id'])
In [15]:
check.unique()
Out[15]:
array([ True])
2.2 Drop the ['publication', 'project', 'date', 'country', 'lineage', 'sublineage'] columns from Target¶
- Keep only the drug-resistance columns and drop everything else
In [16]:
df_targets.columns
Out[16]:
Index(['id', 'publication', 'project', 'date', 'country', 'lineage', 'sublineage', 'ethambutol', 'isoniazid', 'pyrazinamide', 'rifampicin', 'streptomycin'], dtype='object')
In [23]:
df_targets_fillna = df_targets[['ethambutol', 'isoniazid', 'pyrazinamide', 'rifampicin',
'streptomycin']]
df_targets_fillna.head()
Out[23]:
| ethambutol | isoniazid | pyrazinamide | rifampicin | streptomycin
---|---|---|---|---|---|
0 | 1.0 | 1.0 | 1.0 | 1.0 | NaN |
1 | 0.0 | 0.0 | 0.0 | 0.0 | NaN |
2 | 0.0 | 0.0 | 0.0 | 0.0 | NaN |
3 | 0.0 | 0.0 | 0.0 | 0.0 | NaN |
4 | 0.0 | 0.0 | 0.0 | 0.0 | NaN |
2.3 Missing-value imputation¶
2.3.1 Replace missing Feature values with 0¶
In [24]:
df_features.head()
Out[24]:
| id | 31 | 64 | 238 | 266 | 326 | 328 | 351 | 365 | 526 | ... | 4409708 | 4409819 | 4409959 | 4409973 | 4410076 | 4410297 | 4410512 | 4410590 | 4410666 | 4410782
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | ERR2512419 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
1 | ERR2512432 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
2 | ERR2512444 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
3 | SRR6046861 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
4 | ERR2512448 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
5 rows × 41774 columns
In [25]:
df_features_fillna = df_features.drop(['id'], axis=1)
In [26]:
len(df_features_fillna.isna())  # note: len() only returns the row count; df_features_fillna.isna().sum().sum() would count the missing cells
Out[26]:
3574
In [27]:
df_features_fillna = df_features_fillna.fillna(0)
2.3.2 Replace missing Target values with 0¶
In [28]:
df_targets_fillna.isna()
Out[28]:
| ethambutol | isoniazid | pyrazinamide | rifampicin | streptomycin
---|---|---|---|---|---|
0 | False | False | False | False | True |
1 | False | False | False | False | True |
2 | False | False | False | False | True |
3 | False | False | False | False | True |
4 | False | False | False | False | True |
... | ... | ... | ... | ... | ... |
3569 | False | False | True | False | False |
3570 | False | False | False | False | False |
3571 | False | False | False | False | False |
3572 | False | False | True | False | False |
3573 | False | False | True | False | False |
3574 rows × 5 columns
In [29]:
df_targets_fillna = df_targets_fillna.fillna(0)
df_targets_fillna
Out[29]:
| ethambutol | isoniazid | pyrazinamide | rifampicin | streptomycin
---|---|---|---|---|---|
0 | 1.0 | 1.0 | 1.0 | 1.0 | 0.0 |
1 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
2 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
3 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
4 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
... | ... | ... | ... | ... | ... |
3569 | 1.0 | 1.0 | 0.0 | 1.0 | 1.0 |
3570 | 1.0 | 1.0 | 1.0 | 1.0 | 1.0 |
3571 | 1.0 | 1.0 | 1.0 | 1.0 | 1.0 |
3572 | 1.0 | 1.0 | 0.0 | 1.0 | 1.0 |
3573 | 1.0 | 1.0 | 0.0 | 1.0 | 1.0 |
3574 rows × 5 columns
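Note that imputing missing phenotypes with 0 implicitly labels every untested sample-drug pair as susceptible. The alternative preprocessing in section 2 further below drops those rows instead (3574 -> 663 samples), which is why both variants are benchmarked.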
3. Split to train and test sets¶
In [30]:
features_array_fillna = df_features_fillna.to_numpy()
targets_array_fillna = df_targets_fillna.to_numpy()
In [31]:
features_array_fillna, targets_array_fillna
Out[31]:
(array([[0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        ...,
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.]]),
 array([[1., 1., 1., 1., 0.],
        [0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0.],
        ...,
        [1., 1., 1., 1., 1.],
        [1., 1., 0., 1., 1.],
        [1., 1., 0., 1., 1.]]))
In [32]:
(X_train, X_test,
y_train, y_test) = train_test_split(features_array_fillna, targets_array_fillna, test_size=0.3, random_state=42)
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)
(2501, 41773)
(1073, 41773)
(2501, 5)
(1073, 5)
4.1 Binary Relevance classification¶
- Transforms the multilabel problem into one independent binary problem per label (see the sketch below)
- One of the most basic approaches to multilabel classification; it ignores relationships between labels.
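Conceptually, BinaryRelevance fits one independent binary classifier per label column. A minimal sketch of that decomposition, assuming dense numpy arrays shaped like X_train/y_train above (the helper name is hypothetical, not skmultilearn API):
import numpy as np
from sklearn.ensemble import RandomForestClassifier

def binary_relevance_predict(X_tr, Y_tr, X_te):
    """Fit one forest per label and stack the per-label predictions."""
    preds = []
    for j in range(Y_tr.shape[1]):            # one binary problem per drug
        clf = RandomForestClassifier().fit(X_tr, Y_tr[:, j])
        preds.append(clf.predict(X_te))
    return np.column_stack(preds)             # shape (n_samples, n_labels)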
In [28]:
from skmultilearn.problem_transform import BinaryRelevance
from sklearn.ensemble import RandomForestClassifier
import time
start=time.time()
classifier = BinaryRelevance(
classifier = RandomForestClassifier(),
require_dense = [False, True]
)
classifier.fit(X_train, y_train)
print('training time taken: ',round(time.time()-start,0),'seconds')
training time taken: 24.0 seconds
In [29]:
start=time.time()
y_hat=classifier.predict(X_test)
print('prediction time taken: ',round(time.time()-start,0),'seconds')
prediction time taken: 2.0 seconds
In [30]:
import sklearn.metrics as metrics
br_f1=metrics.f1_score(y_test, y_hat, average='micro')
br_hamm=metrics.hamming_loss(y_test,y_hat)
#br_cover=metrics.coverage_error(y_test,y_hat)
print('Binary Relevance F1-score:',round(br_f1,3))
print('Binary Relevance Hamming Loss:',round(br_hamm,3))
#print('Binary Relevance Hamming Loss:',round(br_cover,3))
#print(metrics.multilabel_confusion_matrix(y_test,y_hat))
#print(metrics.label_ranking_loss(y_test,y_hat))
Binary Relevance F1-score: 0.821
Binary Relevance Hamming Loss: 0.124
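For reference: micro-averaged F1 pools true/false positives and negatives across all five drugs before computing F1, while Hamming loss is the fraction of individual sample-drug cells predicted wrongly, $\mathrm{HammingLoss}(Y,\hat{Y}) = \frac{1}{NL}\sum_{i=1}^{N}\sum_{j=1}^{L}\mathbf{1}[y_{ij} \neq \hat{y}_{ij}]$. With N = 1073 test samples and L = 5 drugs, a loss of 0.124 means roughly 665 of the 5365 cells are misclassified.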
In [31]:
y_hat_cv = cross_val_predict(classifier, X_train, y_train, cv=10, n_jobs=-1)
br_f1_cv = f1_score(y_train, y_hat_cv, average='micro')
br_hamm_cv = hamming_loss(y_train, y_hat_cv)
print('Binary Relevance F1-score:',round(br_f1_cv,3))
print('Binary Relevance Hamming Loss:',round(br_hamm_cv,3))
Binary Relevance F1-score: 0.81
Binary Relevance Hamming Loss: 0.127
4.2 Classifier Chains¶
- Preserves label correlations by feeding each earlier label's prediction to the next classifier in the chain (sketch below)
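The chain idea in a hedged sketch (assuming dense arrays; skmultilearn's implementation differs in detail): classifier j sees the original features plus labels 0..j-1, using the true labels at training time and its own predictions at test time.
import numpy as np
from sklearn.ensemble import RandomForestClassifier

def chain_fit_predict(X_tr, Y_tr, X_te):
    preds = []
    for j in range(Y_tr.shape[1]):
        # train label j on [X | y_0 .. y_{j-1}]
        clf = RandomForestClassifier().fit(X_tr, Y_tr[:, j])
        p = clf.predict(X_te)
        preds.append(p)
        # extend the inputs: true labels for training, predictions for test
        X_tr = np.hstack([X_tr, Y_tr[:, j:j+1]])
        X_te = np.hstack([X_te, p.reshape(-1, 1)])
    return np.column_stack(preds)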
In [32]:
from skmultilearn.problem_transform import ClassifierChain
classifier = ClassifierChain(
classifier = RandomForestClassifier(),
require_dense = [False, True],
order=[i for i in range(5)]
)
start=time.time()
classifier.fit(X_train,y_train)
print('training time taken: ',round(time.time()-start,0),'seconds')
training time taken: 23.0 seconds
In [33]:
start=time.time()
y_hat=classifier.predict(X_test)
print('prediction time taken: ',round(time.time()-start,0),'seconds')
prediction time taken: 1.0 seconds
In [34]:
cc_f1=metrics.f1_score(y_test, y_hat, average='micro')
cc_hamm=metrics.hamming_loss(y_test,y_hat)
print('Classifier Chain F1-score:',round(cc_f1,3))
print('Classifier Chain Hamming Loss:',round(cc_hamm,3))
Classifier Chain F1-score: 0.828
Classifier Chain Hamming Loss: 0.119
In [35]:
y_hat_cv = cross_val_predict(classifier, X_train, y_train, cv=10, n_jobs=-1)
cc_f1_cv = f1_score(y_train, y_hat_cv, average='micro')
cc_hamm_cv = hamming_loss(y_train, y_hat_cv)
print('Classifier Chain F1-score:',round(cc_f1_cv,3))
print('Classifier Chain Hamming Loss:',round(cc_hamm_cv,3))
Classifier Chain F1-score: 0.809
Classifier Chain Hamming Loss: 0.127
4.3 LabelPowerset¶
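Label Powerset maps each distinct label combination to a single class, turning the five binary drug labels into one multi-class problem (at most 2^5 = 32 classes). A hedged sketch of the transformation on the dense y_train above:
import numpy as np

# encode each 5-bit resistance pattern, e.g. [1., 1., 0., 1., 1.], as one class id
patterns, y_multiclass = np.unique(y_train, axis=0, return_inverse=True)
print(len(patterns), 'distinct label combinations in y_train')
A single multi-class classifier is then trained on y_multiclass, which preserves label co-occurrence but cannot predict combinations unseen in training.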
In [36]:
from skmultilearn.problem_transform import LabelPowerset
classifier = LabelPowerset(
classifier = RandomForestClassifier(),
require_dense = [False, True]
)
start=time.time()
classifier.fit(X_train, y_train)
print('training time taken: ',round(time.time()-start,0),'seconds')
training time taken: 12.0 seconds
In [37]:
start=time.time()
y_hat=classifier.predict(X_test)
print('prediction time taken: ',round(time.time()-start,0),'seconds')
prediction time taken: 1.0 seconds
In [38]:
lp_f1=metrics.f1_score(y_test, y_hat, average='micro')
lp_hamm=metrics.hamming_loss(y_test,y_hat)
print('Label Powerset F1-score:',round(lp_f1,3))
print('Label Powerset Hamming Loss:',round(lp_hamm,3))
Label Powerset F1-score: 0.814
Label Powerset Hamming Loss: 0.127
In [39]:
y_hat_cv = cross_val_predict(classifier, X_train, y_train, cv=10, n_jobs=-1)
lp_f1_cv = f1_score(y_train, y_hat_cv, average='micro')
lp_hamm_cv = hamming_loss(y_train, y_hat_cv)
print('Label Powerset F1-score:',round(lp_f1_cv,3))
print('Label Powerset Hamming Loss:',round(lp_hamm_cv,3))
Label Powerset F1-score: 0.8
Label Powerset Hamming Loss: 0.132
4.4 MLkNN¶
In [ ]:
#### Adapted Algorithm
from skmultilearn.adapt import MLkNN
knn_clf = MLkNN(k=5)  # k=5 matches the MLkNN cells later in this notebook
knn_clf.fit(X_train, y_train)
In [ ]:
# sanity check on a single training sample
print(knn_clf.predict(X_train[2].reshape(1, -1)))
print(y_train[2])
4.5 Ensembles of Classifiers¶
4.5.1 LabelSpacePartitioningClassifier¶
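This ensemble builds a label co-occurrence graph over the five drugs, clusters it into communities (here with louvain), and trains the base classifier separately on each label partition. A hedged way to inspect the partition itself, assuming scikit-multilearn's clusterer fit_predict(X, y) interface:
from skmultilearn.cluster import LabelCooccurrenceGraphBuilder, NetworkXLabelGraphClusterer

builder = LabelCooccurrenceGraphBuilder(weighted=True, include_self_edges=False)
clusterer = NetworkXLabelGraphClusterer(builder, method='louvain')
# one list of label indices per community, e.g. [[0, 1, 3], [2, 4]]
print(clusterer.fit_predict(X_train, y_train))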
In [40]:
from skmultilearn.ensemble import LabelSpacePartitioningClassifier
from skmultilearn.cluster import LabelCooccurrenceGraphBuilder
from skmultilearn.cluster import NetworkXLabelGraphClusterer
graph_builder = LabelCooccurrenceGraphBuilder(weighted=True,
include_self_edges=False)
classifier = LabelSpacePartitioningClassifier(
classifier = BinaryRelevance(
classifier = RandomForestClassifier(),
require_dense = [False, True]
),
clusterer = NetworkXLabelGraphClusterer(graph_builder, method='louvain')
)
start=time.time()
classifier.fit(X_train,y_train)
print('training time taken: ',round(time.time()-start,0),'seconds')
training time taken: 24.0 seconds
In [41]:
start=time.time()
y_hat=classifier.predict(X_test)
print('prediction time taken: ',round(time.time()-start,0),'seconds')
prediction time taken: 1.0 seconds
In [42]:
part_f1=metrics.f1_score(y_test, y_hat, average='micro')
part_hamm=metrics.hamming_loss(y_test,y_hat)
print('Label Space Partitioning Classifier F1-score:',round(part_f1,3))
print('Label Space Partitioning Classifier Hamming Loss:',round(part_hamm,3))
Label Space Partitioning Classifier F1-score: 0.82
Label Space Partitioning Classifier Hamming Loss: 0.124
In [43]:
y_hat_cv = cross_val_predict(classifier, X_train, y_train, cv=10, n_jobs=-1)
part_f1_cv = f1_score(y_train, y_hat_cv, average='micro')
part_hamm_cv = hamming_loss(y_train, y_hat_cv)
print('Label Space Partitioning Classifier F1-score:',round(part_f1_cv,3))
print('Label Space Partitioning Classifier Hamming Loss:',round(part_hamm_cv,3))
Label Space Partitioning Classifier F1-score: 0.809
Label Space Partitioning Classifier Hamming Loss: 0.127
4.5.2 Majority Voting Classifier¶
In [44]:
from skmultilearn.ensemble import MajorityVotingClassifier
classifier = MajorityVotingClassifier(
classifier = BinaryRelevance(
classifier = RandomForestClassifier(),
require_dense = [False, True]
),
clusterer = NetworkXLabelGraphClusterer(graph_builder, method='louvain')
)
start=time.time()
classifier.fit(X_train,y_train)
print('training time taken: ',round(time.time()-start,0),'seconds')
training time taken: 24.0 seconds
In [45]:
start=time.time()
y_hat=classifier.predict(X_test)
print('prediction time taken: ',round(time.time()-start,0),'seconds')
prediction time taken: 1.0 seconds
In [46]:
majo_f1=metrics.f1_score(y_test, y_hat, average='micro')
majo_hamm=metrics.hamming_loss(y_test,y_hat)
print('Majority Voting Classifier F1-score:',round(majo_f1,3))
print('Majority Voting Classifier Hamming Loss:',round(majo_hamm,3))
Majority Voting Classifier F1-score: 0.822
Majority Voting Classifier Hamming Loss: 0.122
In [47]:
y_hat_cv = cross_val_predict(classifier, X_train, y_train, cv=10, n_jobs=-1)
majo_f1_cv = f1_score(y_train, y_hat_cv, average='micro')
majo_hamm_cv = hamming_loss(y_train, y_hat_cv)
print('Majority Voting Classifier F1-score:',round(majo_f1_cv,3))
print('Majority Voting Classifier Hamming Loss:',round(majo_hamm_cv,3))
Majority Voting Classifier F1-score: 0.811
Majority Voting Classifier Hamming Loss: 0.126
2. Applying the preprocessing differently¶
- Drop rows with missing Target values instead of imputing them
- 3574 -> 663 samples
2.1 Merging the DataFrames¶
In [33]:
df_targets_drugs = df_targets[['ethambutol', 'isoniazid', 'pyrazinamide', 'rifampicin','streptomycin']]
df_targets_drugs.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3574 entries, 0 to 3573
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype
---  ------        --------------  -----
 0   ethambutol    3352 non-null   float64
 1   isoniazid     3501 non-null   float64
 2   pyrazinamide  2212 non-null   float64
 3   rifampicin    3547 non-null   float64
 4   streptomycin  1489 non-null   float64
dtypes: float64(5)
memory usage: 139.7 KB
In [34]:
df = pd.concat([df_features_fillna, df_targets_drugs], axis=1) # column bind
df.head()
Out[34]:
| 31 | 64 | 238 | 266 | 326 | 328 | 351 | 365 | 526 | 573 | ... | 4410297 | 4410512 | 4410590 | 4410666 | 4410782 | ethambutol | isoniazid | pyrazinamide | rifampicin | streptomycin
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 1.0 | 1.0 | 1.0 | NaN |
1 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | NaN |
2 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | NaN |
3 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | NaN |
4 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | NaN |
5 rows × 41778 columns
In [35]:
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3574 entries, 0 to 3573
Columns: 41778 entries, 31 to streptomycin
dtypes: float64(41778)
memory usage: 1.1 GB
In [36]:
df.isnull().sum()
Out[36]:
31                 0
64                 0
238                0
266                0
326                0
                ...
ethambutol       222
isoniazid         73
pyrazinamide    1362
rifampicin        27
streptomycin    2085
Length: 41778, dtype: int64
2.2 Drop rows with missing Target values¶
In [37]:
df_drop = df.dropna()
In [38]:
df_drop.isnull().sum()
Out[38]:
31              0
64              0
238             0
266             0
326             0
               ..
ethambutol      0
isoniazid       0
pyrazinamide    0
rifampicin      0
streptomycin    0
Length: 41778, dtype: int64
In [39]:
# build df_features
df_features = df_drop.iloc[:, :-5]
df_features.head()
Out[39]:
| 31 | 64 | 238 | 266 | 326 | 328 | 351 | 365 | 526 | 573 | ... | 4409708 | 4409819 | 4409959 | 4409973 | 4410076 | 4410297 | 4410512 | 4410590 | 4410666 | 4410782
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
2361 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
2362 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
2363 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
2364 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
2365 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
5 rows × 41773 columns
In [40]:
df_features.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 663 entries, 2361 to 3571
Columns: 41773 entries, 31 to 4410782
dtypes: float64(41773)
memory usage: 211.3 MB
In [41]:
# build df_targets
df_targets = df_drop.iloc[:, -5:]
df_targets.head()
Out[41]:
| ethambutol | isoniazid | pyrazinamide | rifampicin | streptomycin
---|---|---|---|---|---|
2361 | 0.0 | 1.0 | 1.0 | 1.0 | 1.0 |
2362 | 0.0 | 1.0 | 1.0 | 1.0 | 1.0 |
2363 | 0.0 | 1.0 | 1.0 | 1.0 | 1.0 |
2364 | 0.0 | 1.0 | 1.0 | 1.0 | 1.0 |
2365 | 1.0 | 0.0 | 1.0 | 1.0 | 0.0 |
In [42]:
df_targets.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 663 entries, 2361 to 3571
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype
---  ------        --------------  -----
 0   ethambutol    663 non-null    float64
 1   isoniazid     663 non-null    float64
 2   pyrazinamide  663 non-null    float64
 3   rifampicin    663 non-null    float64
 4   streptomycin  663 non-null    float64
dtypes: float64(5)
memory usage: 31.1 KB
3. Split to train and test sets¶
In [43]:
features_array_del = df_features.to_numpy()
targets_array_del = df_targets.to_numpy()
In [44]:
(X_train, X_test,
y_train, y_test) = train_test_split(features_array_del, targets_array_del, test_size=0.3, random_state=42)
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)
(464, 41773)
(199, 41773)
(464, 5)
(199, 5)
4.1 Binary Relevance classification¶
- Transforms the multilabel problem into one independent binary problem per label
In [45]:
from skmultilearn.problem_transform import BinaryRelevance
from sklearn.ensemble import RandomForestClassifier
import time
start=time.time()
classifier = BinaryRelevance(
classifier = RandomForestClassifier(),
require_dense = [False, True]
)
classifier.fit(X_train, y_train)
print('training time taken: ',round(time.time()-start,0),'seconds')
training time taken: 3.0 seconds
In [46]:
start=time.time()
y_hat=classifier.predict(X_test)
print('prediction time taken: ',round(time.time()-start,0),'seconds')
prediction time taken: 0.0 seconds
In [47]:
import sklearn.metrics as metrics
br_f1=metrics.f1_score(y_test, y_hat, average='micro')
br_hamm=metrics.hamming_loss(y_test,y_hat)
print('Binary Relevance F1-score:',round(br_f1,3))
print('Binary Relevance Hamming Loss:',round(br_hamm,3))
Binary Relevance F1-score: 0.874
Binary Relevance Hamming Loss: 0.131
CV¶
In [61]:
y_hat_cv = cross_val_predict(classifier, X_train, y_train, cv=10, n_jobs=-1)
br_f1_cv = f1_score(y_train, y_hat_cv, average='micro')
br_hamm_cv = hamming_loss(y_train, y_hat_cv)
print('Binary Relevance F1-score:',round(br_f1_cv,3))
print('Binary Relevance Hamming Loss:',round(br_hamm_cv,3))
Binary Relevance F1-score: 0.855
Binary Relevance Hamming Loss: 0.163
In [50]:
classifier.get_params().keys()
Out[50]:
dict_keys(['classifier', 'classifier__bootstrap', 'classifier__ccp_alpha', 'classifier__class_weight', 'classifier__criterion', 'classifier__max_depth', 'classifier__max_features', 'classifier__max_leaf_nodes', 'classifier__max_samples', 'classifier__min_impurity_decrease', 'classifier__min_impurity_split', 'classifier__min_samples_leaf', 'classifier__min_samples_split', 'classifier__min_weight_fraction_leaf', 'classifier__n_estimators', 'classifier__n_jobs', 'classifier__oob_score', 'classifier__random_state', 'classifier__verbose', 'classifier__warm_start', 'require_dense'])
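The double-underscore keys show that the base forest can be tuned through the wrapper. A small hedged grid using the GridSearchCV imported at the top of the notebook (the grid values are illustrative, not tuned):
param_grid = {'classifier__n_estimators': [100, 300],
              'classifier__max_depth': [None, 20]}
search = GridSearchCV(BinaryRelevance(classifier=RandomForestClassifier(),
                                      require_dense=[False, True]),
                      param_grid, scoring='f1_micro', cv=3, n_jobs=-1)
search.fit(X_train, y_train)
print(search.best_params_, round(search.best_score_, 3))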
In [65]:
#roc_auc_score(y_train, y_hat_cv, average=None)
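ROC AUC is commented out above because it expects continuous scores, not the hard 0/1 labels that cross_val_predict returns here. A hedged per-drug AUC sketch, assuming the Binary Relevance classifier fitted in 4.1 is still in scope and that its predict_proba returns a sparse matrix (as in scikit-multilearn's problem-transform classifiers):
y_scores = classifier.predict_proba(X_test).toarray()
print(roc_auc_score(y_test, y_scores, average=None))  # one AUC per drug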
4.2 Classifier Chains¶
- Preserves label correlations by feeding each earlier label's prediction to the next classifier in the chain
In [100]:
from skmultilearn.problem_transform import ClassifierChain
classifier = ClassifierChain(
classifier = RandomForestClassifier(),
require_dense = [False, True],
order=[i for i in range(5)]
)
start=time.time()
classifier.fit(X_train,y_train)
print('training time taken: ',round(time.time()-start,0),'seconds')
training time taken: 3.0 seconds
In [101]:
start=time.time()
y_hat=classifier.predict(X_test)
print('prediction time taken: ',round(time.time()-start,0),'seconds')
prediction time taken: 0.0 seconds
In [102]:
cc_f1=metrics.f1_score(y_test, y_hat, average='micro')
cc_hamm=metrics.hamming_loss(y_test,y_hat)
print('Classifier Chain F1-score:',round(cc_f1,3))
print('Classifier Chain Hamming Loss:',round(cc_hamm,3))
Classifier Chain F1-score: 0.877
Classifier Chain Hamming Loss: 0.129
In [103]:
y_hat_cv = cross_val_predict(classifier, X_train, y_train, cv=10, n_jobs=-1)
cc_f1_cv = f1_score(y_train, y_hat_cv, average='micro')
cc_hamm_cv = hamming_loss(y_train, y_hat_cv)
print('Classifier Chain F1-score:',round(cc_f1_cv,3))
print('Classifier Chain Hamming Loss:',round(cc_hamm_cv,3))
Classifier Chain F1-score: 0.858
Classifier Chain Hamming Loss: 0.159
4.3 LabelPowerset¶
In [104]:
from skmultilearn.problem_transform import LabelPowerset
classifier = LabelPowerset(
classifier = RandomForestClassifier(),
require_dense = [False, True]
)
start=time.time()
classifier.fit(X_train, y_train)
print('training time taken: ',round(time.time()-start,0),'seconds')
training time taken: 2.0 seconds
In [105]:
start=time.time()
y_hat=classifier.predict(X_test)
print('prediction time taken: ',round(time.time()-start,0),'seconds')
prediction time taken: 0.0 seconds
In [106]:
lp_f1=metrics.f1_score(y_test, y_hat, average='micro')
lp_hamm=metrics.hamming_loss(y_test,y_hat)
print('Label Powerset F1-score:',round(lp_f1,3))
print('Label Powerset Hamming Loss:',round(lp_hamm,3))
Label Powerset F1-score: 0.863
Label Powerset Hamming Loss: 0.137
In [107]:
y_hat_cv = cross_val_predict(classifier, X_train, y_train, cv=10, n_jobs=-1)
lp_f1_cv = f1_score(y_train, y_hat_cv, average='micro')
lp_hamm_cv = hamming_loss(y_train, y_hat_cv)
print('Label Powerset F1-score:',round(lp_f1_cv,3))
print('Label Powerset Hamming Loss:',round(lp_hamm_cv,3))
Label Powerset F1-score: 0.845
Label Powerset Hamming Loss: 0.171
4.4 Algorithm Adaptation¶
4.4.1 MLkNN¶
In [ ]:
from skmultilearn.adapt import MLkNN
from sklearn.model_selection import GridSearchCV
import time
parameters = {'k': range(2,11), 's': [0.5, 0.7, 1.0]}
score = 'f1_micro'
start=time.time()
classifier = GridSearchCV(MLkNN(), parameters, scoring=score)
classifier.fit(X_train, y_train)
print('training time taken: ',round(time.time()-start,0),'seconds')
print('best parameters :', classifier.best_params_, 'best score: ',
classifier.best_score_)
4.4.2 BRkNNaClassifier¶
In [ ]:
from skmultilearn.adapt import BRkNNaClassifier
parameters = {'k': range(3,10)}
score = 'f1_micro'
start=time.time()
classifier = GridSearchCV(BRkNNaClassifier(), parameters, scoring=score)
classifier.fit(X_train, y_train)
print('training time taken: ',round(time.time()-start,0),'seconds')
print('best parameters :', classifier.best_params_,
'best score: ',classifier.best_score_)
4.5 Ensembles of Classifiers¶
4.5.1 LabelSpacePartitioningClassifier¶
In [108]:
from skmultilearn.ensemble import LabelSpacePartitioningClassifier
from skmultilearn.cluster import LabelCooccurrenceGraphBuilder
from skmultilearn.cluster import NetworkXLabelGraphClusterer
graph_builder = LabelCooccurrenceGraphBuilder(weighted=True,
include_self_edges=False)
classifier = LabelSpacePartitioningClassifier(
classifier = BinaryRelevance(
classifier = RandomForestClassifier(),
require_dense = [False, True]
),
clusterer = NetworkXLabelGraphClusterer(graph_builder, method='louvain')
)
start=time.time()
classifier.fit(X_train,y_train)
print('training time taken: ',round(time.time()-start,0),'seconds')
training time taken: 3.0 seconds
In [109]:
start=time.time()
y_hat=classifier.predict(X_test)
print('prediction time taken: ',round(time.time()-start,0),'seconds')
prediction time taken: 0.0 seconds
In [110]:
part_f1=metrics.f1_score(y_test, y_hat, average='micro')
part_hamm=metrics.hamming_loss(y_test,y_hat)
print('Label Space Partitioning Classifier F1-score:',round(part_f1,3))
print('Label Space Partitioning Classifier Hamming Loss:',round(part_hamm,3))
Label Space Partitioning Classifier F1-score: 0.877
Label Space Partitioning Classifier Hamming Loss: 0.128
In [111]:
y_hat_cv = cross_val_predict(classifier, X_train, y_train, cv=10, n_jobs=-1)
part_f1_cv = f1_score(y_train, y_hat_cv, average='micro')
part_hamm_cv = hamming_loss(y_train, y_hat_cv)
print('Label Space Partitioning Classifier F1-score:',round(part_f1_cv,3))
print('Label Space Partitioning Classifier Hamming Loss:',round(part_hamm_cv,3))
Label Space Partitioning Classifier F1-score: 0.851
Label Space Partitioning Classifier Hamming Loss: 0.167
4.5.2 Majority Voting Classifier¶
In [112]:
from skmultilearn.ensemble import MajorityVotingClassifier
classifier = MajorityVotingClassifier(
classifier = BinaryRelevance(
classifier = RandomForestClassifier(),
require_dense = [False, True]
),
clusterer = NetworkXLabelGraphClusterer(graph_builder, method='louvain')
)
start=time.time()
classifier.fit(X_train,y_train)
print('training time taken: ',round(time.time()-start,0),'seconds')
training time taken: 3.0 seconds
In [113]:
start=time.time()
y_hat=classifier.predict(X_test)
print('prediction time taken: ',round(time.time()-start,0),'seconds')
prediction time taken: 0.0 seconds
In [114]:
majo_f1=metrics.f1_score(y_test, y_hat, average='micro')
majo_hamm=metrics.hamming_loss(y_test,y_hat)
print('Majority Voting Classifier F1-score:',round(majo_f1,3))
print('Majority Voting Classifier Hamming Loss:',round(majo_hamm,3))
Majority Voting Classifier F1-score: 0.872
Majority Voting Classifier Hamming Loss: 0.132
In [115]:
y_hat_cv = cross_val_predict(classifier, X_train, y_train, cv=10, n_jobs=-1)
majo_f1_cv = f1_score(y_train, y_hat_cv, average='micro')
majo_hamm_cv = hamming_loss(y_train, y_hat_cv)
print('Majority Voting Classifier F1-score:',round(majo_f1_cv,3))
print('Majority Voting Classifier Hamming Loss:',round(majo_hamm_cv,3))
Majority Voting Classifier F1-score: 0.853
Majority Voting Classifier Hamming Loss: 0.164
2. Data preprocessing¶
2.1 Imputation + sparse (CSR) representation¶
In [66]:
from scipy import sparse
In [67]:
features_fillna_csr = sparse.csr_matrix(features_array_fillna)
targets_fillna_csr = sparse.csr_matrix(targets_array_fillna)
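Since nearly every entry of the mutation matrix is 0, CSR stores only the non-zero cells. A quick check of the saving, using scipy's CSR component arrays:
dense_mb = features_array_fillna.nbytes / 1e6
csr_mb = (features_fillna_csr.data.nbytes
          + features_fillna_csr.indices.nbytes
          + features_fillna_csr.indptr.nbytes) / 1e6
print(round(dense_mb, 1), 'MB dense vs', round(csr_mb, 1), 'MB CSR')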
In [78]:
#print(features_fillna_csr)
3. Split to train and test sets¶
In [69]:
(X_train, X_test,
y_train, y_test) = train_test_split(features_fillna_csr, targets_fillna_csr, test_size=0.25, random_state=42)
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)
(2680, 41773)
(894, 41773)
(2680, 5)
(894, 5)
4.1 Binary Relevance classification¶
- Transforms the multilabel problem into one independent binary problem per label
In [70]:
from skmultilearn.problem_transform import BinaryRelevance
from sklearn.ensemble import RandomForestClassifier
import time
start=time.time()
classifier = BinaryRelevance(
classifier = RandomForestClassifier(),
require_dense = [False, True]
)
classifier.fit(X_train, y_train)
print('training time taken: ',round(time.time()-start,0),'seconds')
start=time.time()
y_hat=classifier.predict(X_test)
print('prediction time taken: ',round(time.time()-start,0),'seconds')
import sklearn.metrics as metrics
br_f1=metrics.f1_score(y_test, y_hat, average='micro')
br_hamm=metrics.hamming_loss(y_test,y_hat)
print('Binary Relevance F1-score:',round(br_f1,3))
print('Binary Relevance Hamming Loss:',round(br_hamm,3))
training time taken: 26.0 seconds
prediction time taken: 1.0 seconds
Binary Relevance F1-score: 0.825
Binary Relevance Hamming Loss: 0.12
4.2 Classifier Chains¶
- Preserves label correlations by feeding each earlier label's prediction to the next classifier in the chain
In [71]:
from skmultilearn.problem_transform import ClassifierChain
classifier = ClassifierChain(
classifier = RandomForestClassifier(),
require_dense = [False, True],
order=[i for i in range(5)]
)
start=time.time()
classifier.fit(X_train,y_train)
print('training time taken: ',round(time.time()-start,0),'seconds')
start=time.time()
y_hat=classifier.predict(X_test)
print('prediction time taken: ',round(time.time()-start,0),'seconds')
cc_f1=metrics.f1_score(y_test, y_hat, average='micro')
cc_hamm=metrics.hamming_loss(y_test,y_hat)
print('Classifier Chain F1-score:',round(cc_f1,3))
print('Classifier Chain Hamming Loss:',round(cc_hamm,3))
training time taken: 24.0 seconds
prediction time taken: 1.0 seconds
Classifier Chain F1-score: 0.815
Classifier Chain Hamming Loss: 0.128
4.3 LabelPowerset¶
In [72]:
from skmultilearn.problem_transform import LabelPowerset
classifier = LabelPowerset(
classifier = RandomForestClassifier(),
require_dense = [False, True]
)
start=time.time()
classifier.fit(X_train, y_train)
print('training time taken: ',round(time.time()-start,0),'seconds')
start=time.time()
y_hat=classifier.predict(X_test)
print('prediction time taken: ',round(time.time()-start,0),'seconds')
lp_f1=metrics.f1_score(y_test, y_hat, average='micro')
lp_hamm=metrics.hamming_loss(y_test,y_hat)
print('Label Powerset F1-score:',round(lp_f1,3))
print('Label Powerset Hamming Loss:',round(lp_hamm,3))
training time taken: 13.0 seconds
prediction time taken: 0.0 seconds
Label Powerset F1-score: 0.806
Label Powerset Hamming Loss: 0.133
4.4 MLkNN¶
In [ ]:
classifier = MLkNN(k=5)
# train
classifier.fit(X_train, y_train)
# predict
predictions = classifier.predict(X_test)
acc = accuracy_score(y_test,predictions)
print(acc)
2.2 Row removal + sparse (CSR) representation¶
In [73]:
features_del_csr = sparse.csr_matrix(features_array_del)
targets_del_csr = sparse.csr_matrix(targets_array_del)
In [ ]:
#print(features_del_csr)
3. Split to train and test sets¶
In [74]:
(X_train, X_test,
y_train, y_test) = train_test_split(features_del_csr, targets_del_csr, test_size=0.1, random_state=42)
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)
(596, 41773)
(67, 41773)
(596, 5)
(67, 5)
4.1 Binary Relevance classification¶
- Transforms the multilabel problem into one independent binary problem per label
In [75]:
from skmultilearn.problem_transform import BinaryRelevance
from sklearn.ensemble import RandomForestClassifier
import time
start=time.time()
classifier = BinaryRelevance(
classifier = RandomForestClassifier(),
require_dense = [False, True]
)
classifier.fit(X_train, y_train)
print('training time taken: ',round(time.time()-start,0),'seconds')
start=time.time()
y_hat=classifier.predict(X_test)
print('prediction time taken: ',round(time.time()-start,0),'seconds')
import sklearn.metrics as metrics
br_f1=metrics.f1_score(y_test, y_hat, average='micro')
br_hamm=metrics.hamming_loss(y_test,y_hat)
print('Binary Relevance F1-score:',round(br_f1,3))
print('Binary Relevance Hamming Loss:',round(br_hamm,3))
training time taken: 4.0 seconds
prediction time taken: 0.0 seconds
Binary Relevance F1-score: 0.857
Binary Relevance Hamming Loss: 0.134
4.2 Classifier Chains¶
- Preserve Label Correlation
In [76]:
from skmultilearn.problem_transform import ClassifierChain
classifier = ClassifierChain(
classifier = RandomForestClassifier(),
require_dense = [False, True],
order=[i for i in range(5)]
)
start=time.time()
classifier.fit(X_train,y_train)
print('training time taken: ',round(time.time()-start,0),'seconds')
start=time.time()
y_hat=classifier.predict(X_test)
print('prediction time taken: ',round(time.time()-start,0),'seconds')
cc_f1=metrics.f1_score(y_test, y_hat, average='micro')
cc_hamm=metrics.hamming_loss(y_test,y_hat)
print('Classifier Chain F1-score:',round(cc_f1,3))
print('Classifier Chain Hamming Loss:',round(cc_hamm,3))
training time taken: 4.0 seconds
prediction time taken: 0.0 seconds
Classifier Chain F1-score: 0.854
Classifier Chain Hamming Loss: 0.137
4.3 LabelPowerset¶
In [77]:
from skmultilearn.problem_transform import LabelPowerset
classifier = LabelPowerset(
classifier = RandomForestClassifier(),
require_dense = [False, True]
)
start=time.time()
classifier.fit(X_train, y_train)
print('training time taken: ',round(time.time()-start,0),'seconds')
start=time.time()
y_hat=classifier.predict(X_test)
print('prediction time taken: ',round(time.time()-start,0),'seconds')
lp_f1=metrics.f1_score(y_test, y_hat, average='micro')
lp_hamm=metrics.hamming_loss(y_test,y_hat)
print('Label Powerset F1-score:',round(lp_f1,3))
print('Label Powerset Hamming Loss:',round(lp_hamm,3))
training time taken: 2.0 seconds
prediction time taken: 0.0 seconds
Label Powerset F1-score: 0.85
Label Powerset Hamming Loss: 0.137
4.4 MLkNN¶
In [ ]:
classifier = MLkNN(k=5)
# train
classifier.fit(X_train, y_train)
# predict
predictions = classifier.predict(X_test)
acc = accuracy_score(y_test,predictions)
print(acc)