Classification Tree Illustration¶
In [2]:
# Import libraries
import pandas as pd
import numpy as np
from sklearn.tree import DecisionTreeClassifier, plot_tree
import matplotlib.pyplot as plt
# Set seed for reproducibility
np.random.seed(42)
# Number of samples for the synthetic dataset
n_samples = 100
Create a synthetic dataset¶
In [3]:
# Create features
age = np.random.randint(18, 65, size=n_samples)
education = np.random.choice(['High School', 'Bachelor', 'Master', 'PhD'], size=n_samples)
tech_savvy = np.random.choice(['Low', 'Medium', 'High'], size=n_samples)
profession = np.random.choice(['Student', 'Engineer', 'Teacher', 'Artist', 'Manager'], size=n_samples)
internet_use = np.round(np.random.normal(loc=5, scale=2, size=n_samples), 1) # daily hours
# Simulate target variable (ChatGPT usage)
# Higher tech_savvy and internet_use correlate with usage
uses_chatgpt = []
for i in range(n_samples):
    score = 0
    if tech_savvy[i] == 'High': score += 2
    if internet_use[i] > 4: score += 1
    if profession[i] in ['Student', 'Engineer']: score += 1
    if education[i] in ['Master', 'PhD']: score += 1
    uses_chatgpt.append(1 if score >= 3 else 0)
# Create DataFrame
df_orig = pd.DataFrame({
    'Age': age,
    'Education': education,
    'Tech_Savvy': tech_savvy,
    'Profession': profession,
    'Internet_Use_Hrs': internet_use,
    'Uses_ChatGPT': uses_chatgpt
})
df_orig.head()
Out[3]:
|   | Age | Education | Tech_Savvy | Profession | Internet_Use_Hrs | Uses_ChatGPT |
|---|---|---|---|---|---|---|
| 0 | 56 | PhD | Low | Manager | 3.9 | 0 |
| 1 | 46 | PhD | High | Teacher | 4.7 | 1 |
| 2 | 32 | Master | Medium | Student | 1.9 | 0 |
| 3 | 60 | Master | Low | Student | 9.0 | 1 |
| 4 | 25 | Master | Low | Teacher | 2.6 | 0 |
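The loop above labels a row 1 only when its score reaches 3, so the classes will not be perfectly balanced. An optional check (not part of the original notebook) before modelling:

# Optional (not in the original notebook): inspect the class balance of the simulated target
print(df_orig['Uses_ChatGPT'].value_counts())
# And how usage varies with the strongest simulated driver (High tech savvy adds +2 to the score)
print(df_orig.groupby('Tech_Savvy')['Uses_ChatGPT'].mean())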
In [4]:
# Encode categorical variables
df = pd.get_dummies(df_orig.drop(columns='Uses_ChatGPT'), drop_first=True)
df.head()
Out[4]:
|   | Age | Internet_Use_Hrs | Education_High School | Education_Master | Education_PhD | Tech_Savvy_Low | Tech_Savvy_Medium | Profession_Engineer | Profession_Manager | Profession_Student | Profession_Teacher |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 56 | 3.9 | False | False | True | True | False | False | True | False | False |
| 1 | 46 | 4.7 | False | False | True | False | False | False | False | False | True |
| 2 | 32 | 1.9 | False | True | False | False | True | False | False | True | False |
| 3 | 60 | 9.0 | False | True | False | True | False | False | False | True | False |
| 4 | 25 | 2.6 | False | True | False | True | False | False | False | False | True |
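Because drop_first=True keeps one level of each categorical as the implicit baseline, the columns for Bachelor, High (tech savvy), and Artist do not appear above. A small optional sanity check (not part of the original notebook):

# Optional (not in the original notebook): confirm the encoded feature set
print(df.shape)           # expect (100, 11): Age, Internet_Use_Hrs, plus 9 dummy columns
print(list(df.columns))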
Train model¶
In [5]:
# Prepare features and target
X = df
y = df_orig['Uses_ChatGPT']
# Train a simple decision tree
clf = DecisionTreeClassifier(max_depth=3, random_state=42)
clf.fit(X, y)
Out[5]:
DecisionTreeClassifier(max_depth=3, random_state=42)
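The tree here is fit on the full dataset purely for illustration. To estimate out-of-sample performance, a hedged sketch (not part of the original notebook) would hold out a test set:

# Optional evaluation sketch (not in the original notebook)
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Hold out 30% of the rows, stratified on the target
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y)

clf_eval = DecisionTreeClassifier(max_depth=3, random_state=42)
clf_eval.fit(X_train, y_train)
print("Held-out accuracy:", accuracy_score(y_test, clf_eval.predict(X_test)))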
Visualize Tree¶
In [6]:
# Visualize the tree
plt.figure(figsize=(16, 8))
plot_tree(clf,
          feature_names=X.columns,
          class_names=['No', 'Yes'],
          filled=True,
          rounded=True)
plt.title("Classification Tree: Predicting ChatGPT Usage")
plt.show()
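Besides the plot, scikit-learn can render the same splits as plain text, which can be easier to read in a terminal or log file. An optional sketch (not part of the original notebook):

# Optional (not in the original notebook): plain-text view of the fitted tree
from sklearn.tree import export_text
print(export_text(clf, feature_names=list(X.columns)))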
Make a single prediction¶
In [8]:
# Example: Predict ChatGPT usage for a new individual
new_person = pd.DataFrame([{
    'Age': 23,
    'Education': 'Master',
    'Tech_Savvy': 'High',
    'Profession': 'Student',
    'Internet_Use_Hrs': 6.5
}])
# Encode using the same dummy variables (align with training data)
new_encoded = pd.get_dummies(new_person)
# Make sure all expected columns are present (fill missing with 0)
new_encoded = new_encoded.reindex(columns=X.columns, fill_value=0)
# Make the prediction
prediction = clf.predict(new_encoded)[0]
# Output the result
print(f"Prediction:\n {'Yes' if prediction == 1 else 'No'}, this person will{'' if prediction == 1 else ' not'} use ChatGPT")
Prediction:
 No, this person will not use ChatGPT
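Note that this "No" comes from the fitted depth-3 tree, not from the scoring rule used to generate the labels: under that rule this person would score 5 (High tech savvy +2, more than 4 hours online +1, Student +1, Master's +1) and be labelled 1, so the shallow tree is only an approximation of the rule. To see how confident the tree is, one could also inspect the class proportions in the leaf the new row falls into (an optional sketch, not part of the original notebook):

# Optional (not in the original notebook): class proportions in the leaf for this individual
proba = clf.predict_proba(new_encoded)[0]
print(f"P(No) = {proba[0]:.2f}, P(Yes) = {proba[1]:.2f}")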