import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn import preprocessing
import scipy.cluster.hierarchy as shc
from sklearn.metrics import silhouette_score
from sklearn.cluster import AgglomerativeClustering
In this article, we use another well-known customer segmentation dataset, which contains more variables than the one used in the previous article on K-means clustering. You can download it from this link; we only need the Train.csv file. After downloading the dataset, create a data folder in your current working directory and put the file inside it. Then let's use pandas to read the data.
df = pd.read_csv('./data/Train.csv', index_col='ID')
df.head()
| ID | Gender | Ever_Married | Age | Graduated | Profession | Work_Experience | Spending_Score | Family_Size | Var_1 | Segmentation |
|---|---|---|---|---|---|---|---|---|---|---|
| 462809 | Male | No | 22 | No | Healthcare | 1.0 | Low | 4.0 | Cat_4 | D |
| 462643 | Female | Yes | 38 | Yes | Engineer | NaN | Average | 3.0 | Cat_4 | A |
| 466315 | Female | Yes | 67 | Yes | Engineer | 1.0 | Low | 1.0 | Cat_6 | B |
| 461735 | Male | Yes | 67 | Yes | Lawyer | 0.0 | High | 2.0 | Cat_6 | B |
| 462669 | Female | Yes | 40 | Yes | Entertainment | NaN | High | 6.0 | Cat_6 | A |
In this section, the main task is to preprocess the data: selecting features, handling missing values, and encoding the categorical/string variables.
df.columns
Index(['Gender', 'Ever_Married', 'Age', 'Graduated', 'Profession', 'Work_Experience', 'Spending_Score', 'Family_Size', 'Var_1', 'Segmentation'], dtype='object')
We drop the 'Var_1' and 'Segmentation' columns because they are not helpful for our analysis.
df_select = df.drop(['Var_1','Segmentation'], axis=1)
df_select.head()
| ID | Gender | Ever_Married | Age | Graduated | Profession | Work_Experience | Spending_Score | Family_Size |
|---|---|---|---|---|---|---|---|---|
| 462809 | Male | No | 22 | No | Healthcare | 1.0 | Low | 4.0 |
| 462643 | Female | Yes | 38 | Yes | Engineer | NaN | Average | 3.0 |
| 466315 | Female | Yes | 67 | Yes | Engineer | 1.0 | Low | 1.0 |
| 461735 | Male | Yes | 67 | Yes | Lawyer | 0.0 | High | 2.0 |
| 462669 | Female | Yes | 40 | Yes | Entertainment | NaN | High | 6.0 |
df_select.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 8068 entries, 462809 to 461879
Data columns (total 8 columns):
 #   Column           Non-Null Count  Dtype
---  ------           --------------  -----
 0   Gender           8068 non-null   object
 1   Ever_Married     7928 non-null   object
 2   Age              8068 non-null   int64
 3   Graduated        7990 non-null   object
 4   Profession       7944 non-null   object
 5   Work_Experience  7239 non-null   float64
 6   Spending_Score   8068 non-null   object
 7   Family_Size      7733 non-null   float64
dtypes: float64(2), int64(1), object(5)
memory usage: 567.3+ KB
The info output shows that several variables have missing values. Let's check what percentage of each column is missing.
pd.DataFrame({'missing':df_select.isnull().sum(),
'percentage':(df_select.isnull().sum() / np.shape(df_select)[0]) * 100})
| | missing | percentage |
|---|---|---|
| Gender | 0 | 0.000000 |
| Ever_Married | 140 | 1.735250 |
| Age | 0 | 0.000000 |
| Graduated | 78 | 0.966782 |
| Profession | 124 | 1.536936 |
| Work_Experience | 829 | 10.275161 |
| Spending_Score | 0 | 0.000000 |
| Family_Size | 335 | 4.152206 |
The results show that the largest share of missing values in any column is only about 10% (Work_Experience), so we can simply drop the rows that contain missing values.
df_select = df_select.dropna()
df_select.isnull().sum()
Gender             0
Ever_Married       0
Age                0
Graduated          0
Profession         0
Work_Experience    0
Spending_Score     0
Family_Size        0
dtype: int64
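As an aside, dropping rows is not the only option: the missing values could instead be imputed. Below is a minimal sketch using scikit-learn's SimpleImputer, applied to the pre-drop DataFrame; the names df_alt, num_alt and obj_alt are just illustrative, and this variant is not used in the rest of this walkthrough.
from sklearn.impute import SimpleImputer

# alternative to dropna(): fill numerical columns with the median
# and categorical columns with the most frequent value
df_alt = df.drop(['Var_1', 'Segmentation'], axis=1)
num_alt = df_alt.select_dtypes(include=['number']).columns
obj_alt = df_alt.select_dtypes(exclude=['number']).columns
df_alt[num_alt] = SimpleImputer(strategy='median').fit_transform(df_alt[num_alt])
df_alt[obj_alt] = SimpleImputer(strategy='most_frequent').fit_transform(df_alt[obj_alt])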
We have several categorical/string variables, which need to be encoded as numbers. If you are not familiar with categorical variable encoding, please refer to this previous article. In this example, we use OneHotEncoder() from scikit-learn, so that each category becomes its own 0/1 column and can be analyzed separately.
# select the categorical columns
cat_cols = df_select.select_dtypes(exclude=["number"]).columns
cat_cols
Index(['Gender', 'Ever_Married', 'Graduated', 'Profession', 'Spending_Score'], dtype='object')
# onehotencode the categorical features
encoder = preprocessing.OneHotEncoder(sparse_output=False)
trans = encoder.fit_transform(df_select[cat_cols])
# obtain the encoded column names
enc_columns = encoder.get_feature_names_out(cat_cols)
# convert the encoded feature array into a DataFrame
features_enc = pd.DataFrame(trans, columns=enc_columns)
# add the encoded features to the selected feature DataFrame
df_enc = pd.concat([df_select,features_enc.set_index(df_select.index)],axis=1)
# drop the original unencoded categorical columns
df_enc.drop(cat_cols,axis=1, inplace=True)
df_enc.head()
| ID | Age | Work_Experience | Family_Size | Gender_Female | Gender_Male | Ever_Married_No | Ever_Married_Yes | Graduated_No | Graduated_Yes | Profession_Artist | ... | Profession_Engineer | Profession_Entertainment | Profession_Executive | Profession_Healthcare | Profession_Homemaker | Profession_Lawyer | Profession_Marketing | Spending_Score_Average | Spending_Score_High | Spending_Score_Low |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 462809 | 22 | 1.0 | 4.0 | 0.0 | 1.0 | 1.0 | 0.0 | 1.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 |
| 466315 | 67 | 1.0 | 1.0 | 1.0 | 0.0 | 0.0 | 1.0 | 0.0 | 1.0 | 0.0 | ... | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 |
| 461735 | 67 | 0.0 | 2.0 | 0.0 | 1.0 | 0.0 | 1.0 | 0.0 | 1.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 1.0 | 0.0 |
| 461319 | 56 | 0.0 | 2.0 | 0.0 | 1.0 | 0.0 | 1.0 | 1.0 | 0.0 | 1.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 |
| 460156 | 32 | 1.0 | 3.0 | 0.0 | 1.0 | 1.0 | 0.0 | 0.0 | 1.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 |
5 rows × 21 columns
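For reference, pandas can produce a roughly equivalent encoded frame in one line with get_dummies; we stick with OneHotEncoder above because a fitted encoder can later be applied to new data. Note that, depending on the pandas version, the dummy columns may be bool or uint8 rather than float.
# roughly equivalent shortcut with pandas
df_enc_alt = pd.get_dummies(df_select, columns=list(cat_cols))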
X = df_enc.values
X
array([[22.,  1.,  4., ...,  0.,  0.,  1.],
       [67.,  1.,  1., ...,  0.,  0.,  1.],
       [67.,  0.,  2., ...,  0.,  1.,  0.],
       ...,
       [33.,  1.,  1., ...,  0.,  0.,  1.],
       [27.,  1.,  4., ...,  0.,  0.,  1.],
       [37.,  0.,  3., ...,  1.,  0.,  0.]])
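One caveat worth noting: Ward linkage works on Euclidean distances, so the Age column, whose values are far larger than the 0/1 encoded columns, will dominate those distances. We keep the unscaled data in this example, but a standardized version could be built as below if you want to compare the two.
from sklearn.preprocessing import StandardScaler

# optional: standardize all features so that Age does not dominate the Euclidean distances
X_scaled = StandardScaler().fit_transform(df_enc)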
A dendrogram is a tree-like diagram that shows the hierarchical relationships between clusters in a hierarchical clustering algorithm. The dendrogram function is a visualization tool provided by the scipy.cluster.hierarchy
module in Python for displaying the hierarchical clustering results in the form of a dendrogram.
The dendrogram function takes a linkage matrix as input, which records which clusters were merged at each step of the clustering process and at what distance. The function then plots the dendrogram, where the height of each node in the tree represents the distance between the two clusters merged at that level.
The dendrogram function also provides several optional parameters to customize the appearance of the plot, such as the orientation of the tree, the color of the branches, and the font size of the labels.
The dendrogram function is a useful tool for visualizing hierarchical clustering results and gaining insight into the structure of the data. It can also help in choosing the number of clusters for subsequent analysis, by identifying a level in the tree where clusters only merge at comparatively large distances, so that cutting there leaves compact, well-separated groups.
First, we need to compute the linkage matrix using the linkage function from the scipy.cluster.hierarchy library. The linkage function merges clusters step by step, measuring the distance between clusters according to the specified linkage criterion. There are several linkage criteria available, including "ward", "complete", "average", and "single"; we use "ward" in this example.
linkage_matrix = shc.linkage(X, 'ward')
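Each row of the resulting linkage matrix records one merge: the indices of the two clusters being merged, the distance between them, and the number of original samples in the newly formed cluster. Inspecting the last few rows gives a feel for the distances near the top of the tree.
# each row: [cluster index 1, cluster index 2, merge distance, size of the merged cluster]
print(linkage_matrix.shape)   # (n_samples - 1, 4)
print(linkage_matrix[-5:])    # the five final (largest-distance) merges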
After calculating the linkage matrix, we can plot the dendrogram using the dendrogram function from the same library. The dendrogram shows the hierarchical relationships between the data points and the clusters. In the plot, we draw three horizontal threshold lines to mark possible choices for the number of clusters.
plt.figure(figsize=(10, 7))
shc.dendrogram(linkage_matrix)
plt.title("Customers Dendrogram")
plt.axhline(y=1000, color='r', linestyle="--")
plt.axhline(y=600, color='m', linestyle="--")
plt.axhline(y=400, color='y', linestyle="--")
plt.show()
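With several thousand customers, the bottom of the full dendrogram is very dense. If you only care about the top-level structure, the dendrogram function can be truncated to show just the last merges, for example:
# show only the last 20 merged clusters instead of every single leaf
plt.figure(figsize=(10, 7))
shc.dendrogram(linkage_matrix, truncate_mode='lastp', p=20)
plt.title("Customers Dendrogram (truncated)")
plt.show()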
The dendrogram gives us a reference for choosing the number of clusters. Reading from top to bottom, cutting at the highest line gives 2 clusters as a first option, and the lower cuts give 3 and 4 clusters as the second and third options, and so on.
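The same threshold lines can also be turned directly into flat cluster labels with scipy's fcluster function, which cuts the tree at a given distance. For example, cutting at the magenta line at y=600:
# cut the tree at distance 600 and return flat (1-based) cluster labels
labels_cut = shc.fcluster(linkage_matrix, t=600, criterion='distance')
print(np.unique(labels_cut, return_counts=True))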
To implement hierarchical clustering with scikit-learn, we need to follow a few steps as follows.
First, we need to define the hierarchical clustering model. In scikit-learn, we can use the AgglomerativeClustering
class to define the model.
We can specify the number of clusters we want to create using the n_clusters
parameter. Two clusters seem too coarse, so we choose 3 in this example. We can also specify the linkage criterion to use for merging the clusters via the linkage
parameter; the linkage criterion determines how the distance between clusters is measured.
model = AgglomerativeClustering(n_clusters=3, linkage='ward')
After defining the model, we can fit it to the data using the fit method and obtain the predicted labels for each data point using the labels_ attribute.
model.fit(X)
labels = model.labels_
labels
array([0, 2, 2, ..., 0, 0, 0], dtype=int64)
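Equivalently, the fit_predict method fits the model and returns the labels in a single call:
# shorthand: fit the model and return the cluster labels in one step
labels = AgglomerativeClustering(n_clusters=3, linkage='ward').fit_predict(X)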
Finally, we can evaluate the clustering performance using metrics such as silhouette score, which measures the similarity between the data points within a cluster compared to the data points in other clusters.
To check how well 3 clusters performs, we compare the silhouette scores for 2 to 10 clusters.
score=[]
range_n_clusters = range(2, 11)
for num_clusters in range_n_clusters:
    # initialise hierarchical clustering
    hcluster = AgglomerativeClustering(n_clusters=num_clusters, linkage='ward')
    hcluster.fit(X)
    cluster_labels = hcluster.labels_
    # silhouette score
    silhouette_avg = silhouette_score(X, cluster_labels)
    score.append(silhouette_avg)
    print("For n_clusters={0}, the silhouette score is {1:.2f}".format(num_clusters, silhouette_avg))
For n_clusters=2, the silhouette score is 0.53
For n_clusters=3, the silhouette score is 0.48
For n_clusters=4, the silhouette score is 0.40
For n_clusters=5, the silhouette score is 0.39
For n_clusters=6, the silhouette score is 0.36
For n_clusters=7, the silhouette score is 0.36
For n_clusters=8, the silhouette score is 0.37
For n_clusters=9, the silhouette score is 0.38
For n_clusters=10, the silhouette score is 0.38
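Since we collected the scores in a list, a quick line plot makes the comparison easier to read:
# plot the silhouette score against the number of clusters
plt.figure(figsize=(8, 5))
plt.plot(list(range_n_clusters), score, marker='o')
plt.xlabel('Number of clusters')
plt.ylabel('Silhouette score')
plt.show()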
The comparison shows that the silhouette score for 3 clusters (0.48) is close to the highest value, which supports 3 clusters as an effective choice.
Here we present the results in tables and plots.
First, we create a result table by adding the cluster labels to the data as a new column.
results = df_enc.copy()
results['Clusters'] = labels
results.head()
| ID | Age | Work_Experience | Family_Size | Gender_Female | Gender_Male | Ever_Married_No | Ever_Married_Yes | Graduated_No | Graduated_Yes | Profession_Artist | ... | Profession_Entertainment | Profession_Executive | Profession_Healthcare | Profession_Homemaker | Profession_Lawyer | Profession_Marketing | Spending_Score_Average | Spending_Score_High | Spending_Score_Low | Clusters |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 462809 | 22 | 1.0 | 4.0 | 0.0 | 1.0 | 1.0 | 0.0 | 1.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0 |
| 466315 | 67 | 1.0 | 1.0 | 1.0 | 0.0 | 0.0 | 1.0 | 0.0 | 1.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 2 |
| 461735 | 67 | 0.0 | 2.0 | 0.0 | 1.0 | 0.0 | 1.0 | 0.0 | 1.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 1.0 | 0.0 | 2 |
| 461319 | 56 | 0.0 | 2.0 | 0.0 | 1.0 | 0.0 | 1.0 | 1.0 | 0.0 | 1.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 1 |
| 460156 | 32 | 1.0 | 3.0 | 0.0 | 1.0 | 1.0 | 0.0 | 0.0 | 1.0 | 0.0 | ... | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0 |
5 rows × 22 columns
In the previous article on K-means clustering, we used several visualization methods to display the results. In this example, many of the features are one-hot encoded variables that only take the values 0 and 1, so a scatter plot is not a good way to display the results. Let's use a parallel coordinates plot instead.
plt.figure(figsize=(15,8))
pd.plotting.parallel_coordinates(results,'Clusters',alpha=0.90,sort_labels=True)
plt.xticks(rotation=80)
plt.show()
The results clearly show that cluster 0 contains much younger people, while cluster 2 contains older people. Because the Age values are much larger than the 0/1 encoded variables, the other variables are hard to read on this scale, so let's drop Age and plot again.
results_drop_age = results.drop(['Age'],axis=1)
plt.figure(figsize=(15,8))
pd.plotting.parallel_coordinates(results_drop_age,'Clusters',alpha=0.90,sort_labels=True)
plt.xticks(rotation=80)
plt.show()
Now we can read more information from the plot. For example, cluster 0 tends to have more work experience and larger family sizes.
We can also analyze the results through summary statistics for each cluster. In this example, we average the numerical variables 'Age', 'Work_Experience' and 'Family_Size', and count how many members of each cluster fall into each encoded category.
# numerical column names
num_cols = results.iloc[:, 0:3].columns
# encoded categorical column names
cat_cols = results.iloc[:, 3:-1].columns
# means of the numerical columns by cluster
results_mean = results.groupby('Clusters')[num_cols].mean()
# counts of each encoded category by cluster
results_count = results.groupby('Clusters')[cat_cols].apply(lambda x: (x == 1).sum())
# combine the results into one DataFrame
results_cluster = pd.concat([results_mean, results_count], axis=1)
results_count
| Clusters | Gender_Female | Gender_Male | Ever_Married_No | Ever_Married_Yes | Graduated_No | Graduated_Yes | Profession_Artist | Profession_Doctor | Profession_Engineer | Profession_Entertainment | Profession_Executive | Profession_Healthcare | Profession_Homemaker | Profession_Lawyer | Profession_Marketing | Spending_Score_Average | Spending_Score_High | Spending_Score_Low |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1819 | 2075 | 2437 | 1457 | 1740 | 2154 | 1029 | 450 | 370 | 493 | 180 | 1059 | 133 | 8 | 172 | 718 | 252 | 2924 |
| 1 | 873 | 1195 | 272 | 1796 | 421 | 1647 | 1047 | 124 | 192 | 278 | 234 | 28 | 40 | 75 | 50 | 875 | 391 | 802 |
| 2 | 324 | 432 | 34 | 722 | 279 | 477 | 135 | 20 | 24 | 44 | 95 | 2 | 5 | 420 | 11 | 84 | 369 | 303 |
# move the 'Clusters' index back to a regular column
results_cluster.reset_index(inplace=True)
results_cluster
| | Clusters | Age | Work_Experience | Family_Size | Gender_Female | Gender_Male | Ever_Married_No | Ever_Married_Yes | Graduated_No | Graduated_Yes | ... | Profession_Engineer | Profession_Entertainment | Profession_Executive | Profession_Healthcare | Profession_Homemaker | Profession_Lawyer | Profession_Marketing | Spending_Score_Average | Spending_Score_High | Spending_Score_Low |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | 32.002825 | 3.254237 | 3.048279 | 1819 | 2075 | 2437 | 1457 | 1740 | 2154 | ... | 370 | 493 | 180 | 1059 | 133 | 8 | 172 | 718 | 252 | 2924 |
| 1 | 1 | 53.324468 | 1.977273 | 2.754836 | 873 | 1195 | 272 | 1796 | 421 | 1647 | ... | 192 | 278 | 234 | 28 | 40 | 75 | 50 | 875 | 391 | 802 |
| 2 | 2 | 76.060847 | 1.197090 | 2.015873 | 324 | 432 | 34 | 722 | 279 | 477 | ... | 24 | 44 | 95 | 2 | 5 | 420 | 11 | 84 | 369 | 303 |
3 rows × 22 columns
These results are very helpful for getting insight into each group. For example, cluster 0 contains more females than the other two clusters, although males outnumber females in all three clusters.
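Because the encoded columns only contain 0s and 1s, taking the mean within each cluster turns the counts into proportions, which makes statements like the one above easy to verify. For example, the share of females per cluster:
# mean of a 0/1 column within each cluster = proportion of that category
print(results.groupby('Clusters')['Gender_Female'].mean())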
plt.figure(figsize=(15,8))
pd.plotting.parallel_coordinates(results_cluster,'Clusters',alpha=0.9,sort_labels=True)
plt.xticks(rotation=80)
plt.show()
The plot of the table above gives a more visual view of the results, which might help us dig further into the data. Since the focus of this article is on methods rather than analysis, we will not discuss further insights here.
Clustering is a powerful unsupervised machine learning technique that can help us find structure in our data without any prior knowledge of the underlying patterns. In this tutorial, we have explored how to implement hierarchical clustering using Python's scikit-learn library and how to visualize the results using dendrograms and parallel coordinates plots.
We started by loading the dataset, selecting features, detecting and handling missing values, and encoding the categorical variables. Next, we calculated the linkage matrix using the ward linkage criterion and created a dendrogram to help choose the number of clusters. Then we implemented hierarchical clustering with scikit-learn by defining the model with the AgglomerativeClustering
class, fitting it to the data, and evaluating the results with the silhouette score. Finally, we presented the results through summary statistics and a parallel coordinates plot.
It is worth noting that hierarchical clustering can be computationally expensive for large datasets, and the choice of linkage criterion can have a significant impact on the results. Therefore, it is important to carefully select an appropriate linkage criterion and the number of clusters (or distance threshold) to obtain the best results.
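If you want to check this sensitivity on the present data, a quick comparison of the available linkage criteria at 3 clusters could look like the following sketch.
# compare silhouette scores of different linkage criteria at 3 clusters
for link in ['ward', 'complete', 'average', 'single']:
    lbls = AgglomerativeClustering(n_clusters=3, linkage=link).fit_predict(X)
    print(f"linkage={link}: silhouette score = {silhouette_score(X, lbls):.2f}")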
Overall, hierarchical clustering is a powerful technique that can help us gain insights into our data and identify meaningful clusters. By visualizing the results with dendrograms and parallel coordinates plots, we can better understand the underlying structure of our data and make more informed decisions.