Classification Tree Illustration¶

In [2]:
# Import libraries
import pandas as pd
import numpy as np
from sklearn.tree import DecisionTreeClassifier, plot_tree
import matplotlib.pyplot as plt

# Set seed for reproducibility
np.random.seed(42)

# Number of samples for the synthetic dataset
n_samples = 100

Create a synthetic dataset¶

In [3]:
# Create features
age = np.random.randint(18, 65, size=n_samples)
education = np.random.choice(['High School', 'Bachelor', 'Master', 'PhD'], size=n_samples)
tech_savvy = np.random.choice(['Low', 'Medium', 'High'], size=n_samples)
profession = np.random.choice(['Student', 'Engineer', 'Teacher', 'Artist', 'Manager'], size=n_samples)
internet_use = np.round(np.random.normal(loc=5, scale=2, size=n_samples), 1)  # daily hours

# Simulate target variable (ChatGPT usage)
# Higher tech_savvy and internet_use correlate with usage
uses_chatgpt = []
for i in range(n_samples):
    score = 0
    if tech_savvy[i] == 'High': score += 2
    if internet_use[i] > 4: score += 1
    if profession[i] in ['Student', 'Engineer']: score += 1
    if education[i] in ['Master', 'PhD']: score += 1
    uses_chatgpt.append(1 if score >= 3 else 0)

# Create DataFrame
df_orig = pd.DataFrame({
    'Age': age,
    'Education': education,
    'Tech_Savvy': tech_savvy,
    'Profession': profession,
    'Internet_Use_Hrs': internet_use,
    'Uses_ChatGPT': uses_chatgpt
})

df_orig.head()
Out[3]:
   Age Education Tech_Savvy Profession  Internet_Use_Hrs  Uses_ChatGPT
0   56       PhD        Low    Manager               3.9             0
1   46       PhD       High    Teacher               4.7             1
2   32    Master     Medium    Student               1.9             0
3   60    Master        Low    Student               9.0             1
4   25    Master        Low    Teacher               2.6             0
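
As a quick sanity check on the scoring rule above (an added sketch, not one of the original cells), the class balance of the simulated target can be inspected with value_counts:

# How many simulated people fall in each class (0 = does not use ChatGPT, 1 = uses ChatGPT)
print(df_orig['Uses_ChatGPT'].value_counts())
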
In [4]:
# Encode categorical variables
df = pd.get_dummies(df_orig.drop(columns='Uses_ChatGPT'), drop_first=True)

df.head()
Out[4]:
   Age  Internet_Use_Hrs  Education_High School  Education_Master  Education_PhD  Tech_Savvy_Low  Tech_Savvy_Medium  Profession_Engineer  Profession_Manager  Profession_Student  Profession_Teacher
0   56               3.9                  False             False           True            True              False                False                 True               False               False
1   46               4.7                  False             False           True           False              False                False                False               False                True
2   32               1.9                  False              True          False           False               True                False                False                True               False
3   60               9.0                  False              True          False            True              False                False                False                True               False
4   25               2.6                  False              True          False            True              False                False                False               False                True
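
Because drop_first=True keeps only k-1 dummy columns per categorical feature, one level of each feature becomes the implicit baseline (encoded as all False). A small check, relying on get_dummies' column-naming scheme (feature name, underscore, level), recovers which level was dropped:

# Recover the baseline level dropped by drop_first=True for each categorical feature
for col in ['Education', 'Tech_Savvy', 'Profession']:
    kept = {c[len(col) + 1:] for c in df.columns if c.startswith(col + '_')}
    baseline = set(df_orig[col].unique()) - kept
    print(f"{col}: baseline level (all dummies False) -> {baseline}")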

Train model¶

In [5]:
# Prepare features and target
X = df
y = df_orig['Uses_ChatGPT']

# Train a simple decision tree
clf = DecisionTreeClassifier(max_depth=3, random_state=42)
clf.fit(X, y)
Out[5]:
DecisionTreeClassifier(max_depth=3, random_state=42)
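The tree here is fit on the full dataset purely for illustration. If you wanted a rough sense of out-of-sample accuracy, a cross-validated score would look something like the sketch below (not part of the original workflow; the cv=5 choice is arbitrary):

from sklearn.model_selection import cross_val_score

# 5-fold cross-validated accuracy of the same depth-3 tree
cv_scores = cross_val_score(DecisionTreeClassifier(max_depth=3, random_state=42), X, y, cv=5)
print(f"CV accuracy: {cv_scores.mean():.2f} (+/- {cv_scores.std():.2f})")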

Visualize the tree¶

In [6]:
# Visualize the tree
plt.figure(figsize=(16, 8))
plot_tree(clf,
          feature_names=X.columns,
          class_names=['No', 'Yes'],
          filled=True,
          rounded=True)
plt.title("Classification Tree: Predicting ChatGPT Usage")
plt.show()
[Figure: decision tree diagram titled "Classification Tree: Predicting ChatGPT Usage"]
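
For a text-only view of the same fitted tree (handy when the plot does not render, e.g. in a static export), scikit-learn's export_text can be used; this is an added sketch, not one of the original cells:

from sklearn.tree import export_text

# Print the split rules of the fitted tree as indented text
print(export_text(clf, feature_names=list(X.columns)))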

Make single prediction¶

In [8]:
# Example: Predict ChatGPT usage for a new individual
new_person = pd.DataFrame([{
    'Age': 23,
    'Education': 'Master',
    'Tech_Savvy': 'High',
    'Profession': 'Student',
    'Internet_Use_Hrs': 6.5
}])

# Encode using the same dummy variables (align with training data)
new_encoded = pd.get_dummies(new_person)

# Make sure all expected columns are present (fill missing with 0)
new_encoded = new_encoded.reindex(columns=X.columns, fill_value=0)

# Make the prediction
prediction = clf.predict(new_encoded)[0]

# Output the result
print(f"Prediction:\n {'Yes' if prediction == 1 else 'No'}, this person will{'' if prediction == 1 else ' not'} use ChatGPT")
Prediction:
 No, this person will not use ChatGPT
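
Beyond the hard class label, predict_proba gives the predicted class probabilities for the same encoded row, which can make the tree's decision easier to interpret (an added sketch):

# Class probabilities for the new person; column order follows clf.classes_ (here 0 = No, 1 = Yes)
proba = clf.predict_proba(new_encoded)[0]
print(f"P(No) = {proba[0]:.2f}, P(Yes) = {proba[1]:.2f}")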