Cada bloque de código tiene su explicación y su gráfico correspondiente
# 1. IMPORTAR LIBRERÍAS
import numpy as np
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn import metrics
import matplotlib.pyplot as plt
# 2. DOWNLOAD AND LOAD THE DATA
# The CSV is read from the working directory. In a notebook it can be fetched
# beforehand with:
#   !wget -O drug200.csv https://s3-api.../drug200.csv
my_data = pd.read_csv("drug200.csv", sep=",")

# Sanity check: preview the first five rows and report (rows, columns).
print("Primeras 5 filas:")
print(my_data.iloc[:5])
print(f"Tamaño: {my_data.shape}")
| Age | Sex | BP | Chol | Na/K | Drug |
|---|---|---|---|---|---|
# Exploratory analysis: for each categorical column, print a heading followed
# by the number of rows in each category.
category_reports = (
    ("Distribución de medicamentos:", 'Drug'),
    ("\nDistribución por sexo:", 'Sex'),
    ("\nNiveles de colesterol:", 'Cholesterol'),
    ("\nPresión arterial:", 'BP'),
)
for heading, column in category_reports:
    print(heading)
    print(my_data[column].value_counts())
Medicamentos
Sexo
Colesterol
Presión Arterial
# X = feature matrix (the inputs used to make the prediction), taken from the
# five predictor columns as a NumPy array.
feature_columns = ['Age', 'Sex', 'BP', 'Cholesterol', 'Na_to_K']
X = my_data[feature_columns].values
print(f"Forma de X: {X.shape}")  # (200, 5)
print(X[:3])  # First 3 patients
# Encode the categorical columns of X as integers, in place.
#
# NOTE: LabelEncoder stores its classes in sorted order, so the integer codes
# follow the alphabetical order of the labels, regardless of the order in
# which they are passed to fit(). The lists below are written in that sorted
# order so that list position == assigned code. The codes are NOT an ordinal
# scale (e.g. LOW < NORMAL < HIGH); use an explicit mapping if one is needed.

# Sex (column 1): F=0, M=1
le_sex = preprocessing.LabelEncoder()
le_sex.fit(['F', 'M'])
X[:, 1] = le_sex.transform(X[:, 1])

# BP (column 2): HIGH=0, LOW=1, NORMAL=2
le_BP = preprocessing.LabelEncoder()
le_BP.fit(['HIGH', 'LOW', 'NORMAL'])
X[:, 2] = le_BP.transform(X[:, 2])

# Cholesterol (column 3): HIGH=0, NORMAL=1
le_Chol = preprocessing.LabelEncoder()
le_Chol.fit(['HIGH', 'NORMAL'])
X[:, 3] = le_Chol.transform(X[:, 3])
# y = target vector: the "Drug" column, i.e. the label the tree has to learn
# to predict.
y = my_data["Drug"]
print(f"Forma de y: {y.shape}")  # (200,)
print(y.iloc[:5])  # First 5 drugs
from sklearn.model_selection import train_test_split
# Hold out 30% of the rows for testing; the fixed random_state keeps the
# split reproducible between runs.
split_parts = train_test_split(X, y, random_state=3, test_size=0.3)
X_trainset, X_testset, y_trainset, y_testset = split_parts

# Report the shape of each partition.
print(f"X_trainset shape: {X_trainset.shape}")  # (140, 5)
print(f"X_testset shape: {X_testset.shape}")  # (60, 5)
print(f"y_trainset shape: {y_trainset.shape}")  # (140,)
print(f"y_testset shape: {y_testset.shape}")  # (60,)