The first step is to install the fuzzy-c-means package using pip or conda:
pip install fuzzy-c-means
conda install fuzzy-c-means
Before starting the clustering, we need to read the data and briefly explore it to check whether there are any missing values.
import pandas as pd
# Load the crime data
url = 'https://raw.githubusercontent.com/ITE-5th/fuzzy-clustering/master/data/crime_data.csv'
data = pd.read_csv(url)
data.head()
|   |            | crime$cluster | Murder | Assault | UrbanPop | Rape |
|---|------------|---------------|--------|---------|----------|------|
| 0 | Alabama    | 4             | 13.2   | 236     | 58       | 21.2 |
| 1 | Alaska     | 4             | 10.0   | 263     | 48       | 44.5 |
| 2 | Arizona    | 4             | 8.1    | 294     | 80       | 31.0 |
| 3 | Arkansas   | 3             | 8.8    | 190     | 50       | 19.5 |
| 4 | California | 4             | 9.0    | 276     | 91       | 40.6 |
This dataset includes a 'crime$cluster' column that already assigns each state to one of 4 groups. We will cluster the data ourselves using the FCM algorithm.
data.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50 entries, 0 to 49
Data columns (total 6 columns):
 #   Column         Non-Null Count  Dtype
---  ------         --------------  -----
 0                  50 non-null     object
 1   crime$cluster  50 non-null     int64
 2   Murder         50 non-null     float64
 3   Assault        50 non-null     int64
 4   UrbanPop       50 non-null     int64
 5   Rape           50 non-null     float64
dtypes: float64(2), int64(3), object(1)
memory usage: 2.5+ KB
The output shows that there are no missing values in the data. Different missing-value detection methods are covered in a previous article.
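For example, the absence of missing values can also be confirmed explicitly with a one-liner (a minimal sketch):
# Count missing values per column; all zeros means the data is complete
data.isnull().sum()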
data.describe()
|       | crime$cluster | Murder   | Assault    | UrbanPop  | Rape      |
|-------|---------------|----------|------------|-----------|-----------|
| count | 50.000000     | 50.00000 | 50.000000  | 50.000000 | 50.000000 |
| mean  | 2.720000      | 7.78800  | 170.760000 | 65.540000 | 21.232000 |
| std   | 1.125584      | 4.35551  | 83.337661  | 14.474763 | 9.366385  |
| min   | 1.000000      | 0.80000  | 45.000000  | 32.000000 | 7.300000  |
| 25%   | 2.000000      | 4.07500  | 109.000000 | 54.500000 | 15.075000 |
| 50%   | 3.000000      | 7.25000  | 159.000000 | 66.000000 | 20.100000 |
| 75%   | 4.000000      | 11.25000 | 249.000000 | 77.750000 | 26.175000 |
| max   | 4.000000      | 17.40000 | 337.000000 | 91.000000 | 46.000000 |
import matplotlib.pyplot as plt
# set the plotting style to 'ggplot'
plt.style.use('ggplot')
data.plot.box()
<Axes: >
The box plots show that there are no severe extreme values or outliers in the data.
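As a quick numerical sanity check, we can also count values that fall outside the usual 1.5 × IQR fences for each numeric column (a minimal sketch; this check is an extra illustration, not part of the original workflow):
# Count values falling outside the 1.5 * IQR fences per numeric column
num = data.select_dtypes('number')
q1, q3 = num.quantile(0.25), num.quantile(0.75)
iqr = q3 - q1
((num < q1 - 1.5 * iqr) | (num > q3 + 1.5 * iqr)).sum()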
cols = ['Murder', 'Assault','Rape']
features = data[cols]
features.head()
|   | Murder | Assault | Rape |
|---|--------|---------|------|
| 0 | 13.2   | 236     | 21.2 |
| 1 | 10.0   | 263     | 44.5 |
| 2 | 8.1    | 294     | 31.0 |
| 3 | 8.8    | 190     | 19.5 |
| 4 | 9.0    | 276     | 40.6 |
Before we can apply FCM, we need to scale the features to have a mean of 0 and a standard deviation of 1. This ensures that all features contribute equally to the distance calculations used in clustering.
from sklearn.preprocessing import StandardScaler
# Scale the data
scaler = StandardScaler()
X = scaler.fit_transform(features)
X
array([[ 1.25517927, 0.79078716, -0.00345116], [ 0.51301858, 1.11805959, 2.50942392], [ 0.07236067, 1.49381682, 1.05346626], [ 0.23470832, 0.23321191, -0.18679398], [ 0.28109336, 1.2756352 , 2.08881393], [ 0.02597562, 0.40290872, 1.88390137], [-1.04088037, -0.73648418, -1.09272319], [-0.43787481, 0.81502956, -0.58583422], [ 1.76541475, 1.99078607, 1.1505301 ], [ 2.22926518, 0.48775713, 0.49265293], [-0.57702994, -1.51224105, -0.11129987], [-1.20322802, -0.61527217, -0.75839217], [ 0.60578867, 0.94836277, 0.29852525], [-0.13637203, -0.70012057, -0.0250209 ], [-1.29599811, -1.39102904, -1.07115345], [-0.41468229, -0.67587817, -0.34856705], [ 0.44344101, -0.74860538, -0.53190987], [ 1.76541475, 0.94836277, 0.10439756], [-1.31919063, -1.06375661, -1.44862395], [ 0.81452136, 1.56654403, 0.70835037], [-0.78576263, -0.26375734, -0.53190987], [ 1.00006153, 1.02108998, 1.49564599], [-1.1800355 , -1.19708982, -0.68289807], [ 1.9277624 , 1.06957478, -0.44563089], [ 0.28109336, 0.0877575 , 0.75148985], [-0.41468229, -0.74860538, -0.521125 ], [-0.80895515, -0.83345379, -0.51034012], [ 1.02325405, 0.98472638, 2.671197 ], [-1.31919063, -1.37890783, -1.26528114], [-0.08998698, -0.14254532, -0.26228808], [ 0.83771388, 1.38472601, 1.17209984], [ 0.76813632, 1.00896878, 0.52500755], [ 1.20879423, 2.01502847, -0.55347961], [-1.62069341, -1.52436225, -1.50254831], [-0.11317951, -0.61527217, 0.01811858], [-0.27552716, -0.23951493, -0.13286962], [-0.66980002, -0.14254532, 0.87012344], [-0.34510472, -0.78496898, -0.68289807], [-1.01768785, 0.03927269, -1.39469959], [ 1.53348953, 1.3119988 , 0.13675217], [-0.92491776, -1.027393 , -0.90938037], [ 1.25517927, 0.20896951, 0.61128652], [ 1.13921666, 0.36654512, 0.46029832], [-1.06407289, -0.61527217, 0.17989166], [-1.29599811, -1.48799864, -1.08193832], [ 0.16513075, -0.17890893, -0.05737552], [-0.87853272, -0.31224214, 0.53579242], [-0.48425985, -1.08799901, -1.28685088], [-1.20322802, -1.42739264, -1.1250778 ], [-0.22914211, -0.11830292, -0.60740397]])
Now we can apply FCM to the preprocessed data. Before fitting the model, we can use a scatter plot to roughly estimate the number of clusters.
plt.scatter(X[:,0], X[:,1])
plt.xlabel('Murder')
plt.ylabel('Assault')
Text(0, 0.5, 'Assault')
From the scatter plot, we can roughly see two large clusters, which could be further divided into 3, 4, or even more clusters. We will divide the data into 3 clusters.
from fcmeans import FCM
# Define the number of clusters
n_clusters = 3
# Initialize the FCM algorithm
fcm = FCM(n_clusters=n_clusters)
# Fit the data to the FCM algorithm
fcm.fit(X)
# Get the cluster centers and the membership matrix
centroids = fcm.centers
membership_mat = fcm.u
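Unlike hard clustering, the membership matrix fcm.u stores, for every data point, its degree of membership in each cluster, and the memberships of each point sum to 1. A quick way to inspect this (a minimal sketch):
# One row per data point, one column per cluster
print(membership_mat.shape)            # (50, 3)
# The memberships of each point across the 3 clusters sum to 1
print(membership_mat[:3].round(3))
print(membership_mat.sum(axis=1)[:5])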
Finally, we can assign each data point to the cluster with the highest membership:
import numpy as np
# Assign each data point to the cluster with the highest membership
labels = np.argmax(membership_mat, axis=1)
labels
array([2, 2, 2, 1, 2, 2, 0, 1, 2, 2, 0, 0, 2, 1, 0, 1, 1, 2, 0, 2, 1, 2, 0, 2, 1, 1, 0, 2, 0, 1, 2, 2, 2, 0, 1, 1, 1, 0, 0, 2, 0, 2, 2, 1, 0, 1, 1, 0, 0, 1], dtype=int64)
# or use predict() directly
labels = fcm.predict(X)
labels
array([2, 2, 2, 1, 2, 2, 0, 1, 2, 2, 0, 0, 2, 1, 0, 1, 1, 2, 0, 2, 1, 2, 0, 2, 1, 1, 0, 2, 0, 1, 2, 2, 2, 0, 1, 1, 1, 0, 0, 2, 0, 2, 2, 1, 0, 1, 1, 0, 0, 1], dtype=int64)
Clustering validation is an important step in any clustering analysis as it helps to evaluate the quality of the clustering results. One way to validate the clustering results is by using the Partition Coefficient (PC) and Partition Entropy Coefficient (PEC) measures.
The Partition Coefficient (PC) measures how crisp, i.e., how close to a hard partition, the fuzzy memberships are. It is defined as the average of the squared membership values:
PC = (1/N) * sum(i=1 to N) sum(c=1 to C) (u_ic)^2
where N is the total number of data points, C is the number of clusters, and u_ic is the degree of membership of data point i in cluster c.
The Partition Entropy Coefficient (PEC) measures the fuzziness of the partition. It is defined as the negative average of the membership values weighted by their logarithms:
PEC = -(1/N) * sum(i=1 to N) sum(c=1 to C) u_ic * log2(u_ic)
where N, C, and u_ic are defined as above.
PC ranges from 1/C to 1 and PEC ranges from 0 to log2(C). A PC value close to 1 and a PEC value close to 0 both indicate a nearly crisp partition with compact, well-separated clusters, while a lower PC and a higher PEC indicate a fuzzier, more overlapping partition.
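To make the definitions concrete, both measures can be computed directly from the membership matrix with a few lines of NumPy (a minimal sketch; the results should match the package's built-in properties used below, possibly up to the choice of logarithm base):
# Compute PC and PEC by hand from the membership matrix of the 3-cluster model
N = membership_mat.shape[0]
pc_manual = np.sum(membership_mat ** 2) / N
pec_manual = -np.sum(membership_mat * np.log2(membership_mat)) / N
print(pc_manual, pec_manual)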
We can visualize how the values of PC and PEC change with different numbers of clusters to validate clustering results. First, let's create models with 2, 3, 4, 5, 6 and 7 centers.
n_clusters_list = [2, 3, 4, 5, 6, 7]
models = list()
for n_clusters in n_clusters_list:
    fcm = FCM(n_clusters=n_clusters)
    fcm.fit(X)
    models.append(fcm)
Then, we can easily calculate the values of PC and PEC using the partition_coefficient and partition_entropy_coefficient properties of the fuzzy-c-means package.
# lay the models out on a near-square grid of subplots
num_clusters = len(n_clusters_list)
rows = int(np.ceil(np.sqrt(num_clusters)))
cols = int(np.ceil(num_clusters / rows))
f, axes = plt.subplots(rows, cols, figsize=(11,12))
for n_clusters, model, axe in zip(n_clusters_list, models, axes.ravel()):
    # get validation metrics
    pc = model.partition_coefficient
    pec = model.partition_entropy_coefficient
    fcm_centers = model.centers
    fcm_labels = model.predict(X)
    # plot result
    axe.scatter(X[:,0], X[:,1], c=fcm_labels, alpha=.9)
    axe.scatter(fcm_centers[:,0], fcm_centers[:,1], marker="+", s=200, c='r')
    axe.set_title(f'n_clusters = {n_clusters}, PC = {pc:.3f}, PEC = {pec:.3f}')
plt.show()
The plots show that the model with 2 clusters obtains the highest partition coefficient (PC) and the lowest partition entropy coefficient (PEC) for this example, and the models with 3 and 4 clusters also achieve comparatively high PC and low PEC values.
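If you prefer to read the metrics directly rather than from the subplot titles, a small summary loop works as well (a minimal sketch):
# Print PC and PEC for each fitted model
for n_clusters, model in zip(n_clusters_list, models):
    print(f'n_clusters = {n_clusters}: '
          f'PC = {model.partition_coefficient:.3f}, '
          f'PEC = {model.partition_entropy_coefficient:.3f}')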
We can visualize the results of the clustering algorithm by plotting the data points with different colors based on their assigned cluster.
In the following example, we plot a scatter plot between 'Murder' and 'Assault' with the centroids of each cluster.
# Define the colors for each cluster
colors = ['b', 'g', 'r', 'c', 'm']
n_clusters=3
# Plot the data points
for i in range(n_clusters):
    plt.scatter(X[labels == i, 0], X[labels == i, 1], c=colors[i])
# Plot the centroids
plt.scatter(centroids[:, 0], centroids[:, 1], marker='x', s=100, c='#050505')
# Set the axis labels
plt.xlabel('Murder')
plt.ylabel('Assault')
# Show the plot
plt.show()
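Because FCM produces soft assignments, we can also make the fuzziness visible by scaling each point's marker size with its highest membership degree. This is an extra illustration on top of the original workflow (a minimal sketch reusing the variables defined above):
# Marker size reflects how strongly each point belongs to its assigned cluster:
# small markers sit near cluster boundaries, large markers near the centers
strength = membership_mat.max(axis=1)
for i in range(n_clusters):
    mask = labels == i
    plt.scatter(X[mask, 0], X[mask, 1], c=colors[i], s=200 * strength[mask])
plt.scatter(centroids[:, 0], centroids[:, 1], marker='x', s=100, c='#050505')
plt.xlabel('Murder')
plt.ylabel('Assault')
plt.show()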
results = features.copy()
results['Labels'] = labels
results.head()
|   | Murder | Assault | Rape | Labels |
|---|--------|---------|------|--------|
| 0 | 13.2   | 236     | 21.2 | 2      |
| 1 | 10.0   | 263     | 44.5 | 2      |
| 2 | 8.1    | 294     | 31.0 | 2      |
| 3 | 8.8    | 190     | 19.5 | 1      |
| 4 | 9.0    | 276     | 40.6 | 2      |
import seaborn as sns
# Create a scatterplot matrix
sns.pairplot(results, hue='Labels', palette='Dark2')
<seaborn.axisgrid.PairGrid at 0x1f3f64f4700>
Fuzzy C-Means clustering is a powerful unsupervised machine learning technique that can be used to group data points with similar characteristics. It is particularly useful when the boundaries between clusters are not well-defined, or when a data point could belong to multiple clusters with different degrees of membership.
In this tutorial, we used the crime data to demonstrate how Fuzzy C-Means clustering can be easily implemented in Python using the fuzzy-c-means package. We loaded the data, preprocessed it, and applied the FCM algorithm to cluster the observations based on their 'Murder', 'Assault', and 'Rape' values. Finally, we visualized the clustering results by plotting the data points with different colors based on their assigned clusters.
Overall, Fuzzy C-Means clustering is a useful tool for data analysis, and it can be applied to a wide range of real-world problems, such as customer segmentation, image segmentation, and pattern recognition.