Classification Tree Illustration¶
In [2]:
# Import libraries
import pandas as pd
import numpy as np
from sklearn.tree import DecisionTreeClassifier, plot_tree
import matplotlib.pyplot as plt
# Set seed for reproducibility
np.random.seed(42)
# Number of samples for the synthetic dataset
n_samples = 100
Create a synthetic dataset¶
In [3]:
# Create features
age = np.random.randint(18, 65, size=n_samples)
education = np.random.choice(['High School', 'Bachelor', 'Master', 'PhD'], size=n_samples)
tech_savvy = np.random.choice(['Low', 'Medium', 'High'], size=n_samples)
profession = np.random.choice(['Student', 'Engineer', 'Teacher', 'Artist', 'Manager'], size=n_samples)
internet_use = np.round(np.random.normal(loc=5, scale=2, size=n_samples), 1) # daily hours
# Simulate target variable (ChatGPT usage)
# Higher tech_savvy and internet_use correlate with usage
uses_chatgpt = []
for i in range(n_samples):
    score = 0
    if tech_savvy[i] == 'High': score += 2
    if internet_use[i] > 4: score += 1
    if profession[i] in ['Student', 'Engineer']: score += 1
    if education[i] in ['Master', 'PhD']: score += 1
    uses_chatgpt.append(1 if score >= 3 else 0)
# Create DataFrame
df_orig = pd.DataFrame({
    'Age': age,
    'Education': education,
    'Tech_Savvy': tech_savvy,
    'Profession': profession,
    'Internet_Use_Hrs': internet_use,
    'Uses_ChatGPT': uses_chatgpt
})
df_orig.head()
Out[3]:
|   | Age | Education | Tech_Savvy | Profession | Internet_Use_Hrs | Uses_ChatGPT |
|---|---|---|---|---|---|---|
| 0 | 56 | PhD | Low | Manager | 3.9 | 0 |
| 1 | 46 | PhD | High | Teacher | 4.7 | 1 |
| 2 | 32 | Master | Medium | Student | 1.9 | 0 |
| 3 | 60 | Master | Low | Student | 9.0 | 1 |
| 4 | 25 | Master | Low | Teacher | 2.6 | 0 |
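The loop above labels a row 1 only when its score reaches 3, so the classes will not be perfectly balanced. An optional check (not part of the original notebook) before modelling:

# Optional (not in the original notebook): inspect the class balance of the simulated target
print(df_orig['Uses_ChatGPT'].value_counts())
# And how usage varies with the strongest simulated driver (High tech savvy adds +2 to the score)
print(df_orig.groupby('Tech_Savvy')['Uses_ChatGPT'].mean())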
In [4]:
# Encode categorical variables
df = pd.get_dummies(df_orig.drop(columns='Uses_ChatGPT'), drop_first=True)
df.head()
Out[4]:
|   | Age | Internet_Use_Hrs | Education_High School | Education_Master | Education_PhD | Tech_Savvy_Low | Tech_Savvy_Medium | Profession_Engineer | Profession_Manager | Profession_Student | Profession_Teacher |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 56 | 3.9 | False | False | True | True | False | False | True | False | False |
| 1 | 46 | 4.7 | False | False | True | False | False | False | False | False | True |
| 2 | 32 | 1.9 | False | True | False | False | True | False | False | True | False |
| 3 | 60 | 9.0 | False | True | False | True | False | False | False | True | False |
| 4 | 25 | 2.6 | False | True | False | True | False | False | False | False | True |
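Because drop_first=True keeps one level of each categorical as the implicit baseline, the columns for Bachelor, High (tech savvy), and Artist do not appear above. A small optional sanity check (not part of the original notebook):

# Optional (not in the original notebook): confirm the encoded feature set
print(df.shape)           # expect (100, 11): Age, Internet_Use_Hrs, plus 9 dummy columns
print(list(df.columns))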
Train model¶
In [5]:
# Prepare features and target
X = df
y = df_orig['Uses_ChatGPT']
# Train a simple decision tree
clf = DecisionTreeClassifier(max_depth=3, random_state=42)
clf.fit(X, y)
Out[5]:
DecisionTreeClassifier(max_depth=3, random_state=42)
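The tree here is fit on the full dataset purely for illustration. To estimate out-of-sample performance, a hedged sketch (not part of the original notebook) would hold out a test set:

# Optional evaluation sketch (not in the original notebook)
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Hold out 30% of the rows, stratified on the target
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y)

clf_eval = DecisionTreeClassifier(max_depth=3, random_state=42)
clf_eval.fit(X_train, y_train)
print("Held-out accuracy:", accuracy_score(y_test, clf_eval.predict(X_test)))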
Visualize Tree¶
In [6]:
# Visualize the tree
plt.figure(figsize=(16, 8))
plot_tree(clf,
          feature_names=X.columns,
          class_names=['No', 'Yes'],
          filled=True,
          rounded=True)
plt.title("Classification Tree: Predicting ChatGPT Usage")
plt.show()
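Besides the plot, scikit-learn can render the same splits as plain text, which can be easier to read in a terminal or log file. An optional sketch (not part of the original notebook):

# Optional (not in the original notebook): plain-text view of the fitted tree
from sklearn.tree import export_text
print(export_text(clf, feature_names=list(X.columns)))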
Make a single prediction¶
In [8]:
# Example: Predict ChatGPT usage for a new individual
new_person = pd.DataFrame([{
    'Age': 23,
    'Education': 'Master',
    'Tech_Savvy': 'High',
    'Profession': 'Student',
    'Internet_Use_Hrs': 6.5
}])
# Encode using the same dummy variables (align with training data)
new_encoded = pd.get_dummies(new_person)
# Make sure all expected columns are present (fill missing with 0)
new_encoded = new_encoded.reindex(columns=X.columns, fill_value=0)
# Make the prediction
prediction = clf.predict(new_encoded)[0]
# Output the result
print(f"Prediction:\n {'Yes' if prediction == 1 else 'No'}, this person will{'' if prediction == 1 else ' not'} use ChatGPT")
Prediction:
 No, this person will not use ChatGPT
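Note that this "No" comes from the fitted depth-3 tree, not from the scoring rule used to generate the labels: under that rule this person would score 5 (High tech savvy +2, more than 4 hours online +1, Student +1, Master's +1) and be labelled 1, so the shallow tree is only an approximation of the rule. To see how confident the tree is, one could also inspect the class proportions in the leaf the new row falls into (an optional sketch, not part of the original notebook):

# Optional (not in the original notebook): class proportions in the leaf for this individual
proba = clf.predict_proba(new_encoded)[0]
print(f"P(No) = {proba[0]:.2f}, P(Yes) = {proba[1]:.2f}")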