"""
cmeans.py : Fuzzy C-means clustering algorithm.
"""
import numpy as np
from scipy.spatial.distance import cdist
from .normalize_columns import normalize_columns, normalize_power_columns
def _cmeans0(data, u_old, c, m, metric):
"""
Single step in generic fuzzy c-means clustering algorithm.
Modified from Ross, Fuzzy Logic w/Engineering Applications (2010),
pages 352-353, equations 10.28 - 10.35.
Parameters inherited from cmeans()
"""
# Normalizing, then eliminating any potential zero values.
u_old = normalize_columns(u_old)
u_old = np.fmax(u_old, np.finfo(np.float64).eps)
um = u_old ** m
# Calculate cluster centers
data = data.T
cntr = um.dot(data) / np.atleast_2d(um.sum(axis=1)).T
d = _distance(data, cntr, metric)
d = np.fmax(d, np.finfo(np.float64).eps)
jm = (um * d ** 2).sum()
u = normalize_power_columns(d, - 2. / (m - 1))
return cntr, u, jm, d
def _distance(data, centers, metric='euclidean'):
"""
Euclidean distance from each point to each cluster center.
Parameters
----------
data : 2d array (N x Q)
Data to be analyzed. There are N data points.
centers : 2d array (C x Q)
Cluster centers. There are C clusters, with Q features.
metric: string
By default is set to euclidean. Passes any option accepted by
``scipy.spatial.distance.cdist``.
Returns
-------
dist : 2d array (C x N)
Euclidean distance from each point, to each cluster center.
See Also
--------
scipy.spatial.distance.cdist
"""
return cdist(data, centers, metric=metric).T
def _fp_coeff(u):
"""
Fuzzy partition coefficient `fpc` relative to fuzzy c-partitioned
matrix `u`. Measures 'fuzziness' in partitioned clustering.
Parameters
----------
u : 2d array (C, N)
Fuzzy c-partitioned matrix; N = number of data points and C = number
of clusters.
Returns
-------
fpc : float
Fuzzy partition coefficient.
"""
n = u.shape[1]
return np.trace(u.dot(u.T)) / float(n)
[docs]def cmeans(data, c, m, error, maxiter,
metric='euclidean',
init=None, seed=None):
"""
Fuzzy c-means clustering algorithm [1].
Parameters
----------
data : 2d array, size (S, N)
Data to be clustered. N is the number of data sets; S is the number
of features within each sample vector.
c : int
Desired number of clusters or classes.
m : float
Array exponentiation applied to the membership function u_old at each
iteration, where U_new = u_old ** m.
error : float
Stopping criterion; stop early if the norm of (u[p] - u[p-1]) < error.
maxiter : int
Maximum number of iterations allowed.
metric: string
By default is set to euclidean. Passes any option accepted by
``scipy.spatial.distance.cdist``.
init : 2d array, size (c, N)
Initial fuzzy c-partitioned matrix. If none provided, algorithm is
randomly initialized.
seed : int
If provided, sets random seed of init. No effect if init is
provided. Mainly for debug/testing purposes.
Returns
-------
cntr : 2d array, size (c, S)
Cluster centers. Data for each center along each feature provided
for every cluster (of the `c` requested clusters).
u : 2d array, (c, N)
Final fuzzy c-partitioned matrix.
u0 : 2d array, (c, N)
Initial guess at fuzzy c-partitioned matrix (either provided init or
random guess used if init was not provided).
d : 2d array, (c, N)
Final Euclidian distance matrix.
jm : 1d array, length P
Objective function history.
p : int
Number of iterations run.
fpc : float
Final fuzzy partition coefficient.
Notes
-----
The algorithm implemented is from Ross et al. [1]_.
Fuzzy C-Means has a known problem with high dimensionality datasets, where
the majority of cluster centers are pulled into the overall center of
gravity. If you are clustering data with very high dimensionality and
encounter this issue, another clustering method may be required. For more
information and the theory behind this, see Winkler et al. [2]_.
References
----------
.. [1] Ross, Timothy J. Fuzzy Logic With Engineering Applications, 3rd ed.
Wiley. 2010. ISBN 978-0-470-74376-8 pp 352-353, eq 10.28 - 10.35.
.. [2] Winkler, R., Klawonn, F., & Kruse, R. Fuzzy c-means in high
dimensional spaces. 2012. Contemporary Theory and Pragmatic
Approaches in Fuzzy Computing Utilization, 1.
"""
# Setup u0
if init is None:
if seed is not None:
np.random.seed(seed=seed)
n = data.shape[1]
u0 = np.random.rand(c, n)
u0 = normalize_columns(u0)
init = u0.copy()
u0 = init
u = np.fmax(u0, np.finfo(np.float64).eps)
# Initialize loop parameters
jm = np.zeros(0)
p = 0
# Main cmeans loop
while p < maxiter - 1:
u2 = u.copy()
[cntr, u, Jjm, d] = _cmeans0(data, u2, c, m, metric)
jm = np.hstack((jm, Jjm))
p += 1
# Stopping rule
if np.linalg.norm(u - u2) < error:
break
# Final calculations
error = np.linalg.norm(u - u2)
fpc = _fp_coeff(u)
return cntr, u, u0, d, jm, p, fpc
[docs]def cmeans_predict(test_data, cntr_trained, m, error, maxiter,
metric='euclidean',
init=None,
seed=None):
"""
Prediction of new data in given a trained fuzzy c-means framework [1].
Parameters
----------
test_data : 2d array, size (S, N)
New, independent data set to be predicted based on trained c-means
from ``cmeans``. N is the number of data sets; S is the number of
features within each sample vector.
cntr_trained : 2d array, size (c, S)
Location of trained centers from prior training c-means.
m : float
Array exponentiation applied to the membership function u_old at each
iteration, where U_new = u_old ** m.
error : float
Stopping criterion; stop early if the norm of (u[p] - u[p-1]) < error.
maxiter : int
Maximum number of iterations allowed.
metric: string
By default is set to euclidean. Passes any option accepted by
``scipy.spatial.distance.cdist``.
init : 2d array, size (c, N)
Initial fuzzy c-partitioned matrix. If none provided, algorithm is
randomly initialized.
seed : int
If provided, sets random seed of init. No effect if init is
provided. Mainly for debug/testing purposes.
Returns
-------
u : 2d array, (c, N)
Final fuzzy c-partitioned matrix.
u0 : 2d array, (c, N)
Initial guess at fuzzy c-partitioned matrix (either provided init or
random guess used if init was not provided).
d : 2d array, (c, N)
Final Euclidian distance matrix.
jm : 1d array, length P
Objective function history.
p : int
Number of iterations run.
fpc : float
Final fuzzy partition coefficient.
Notes
-----
Ross et al. [1]_ did not include a prediction algorithm to go along with
fuzzy c-means. This prediction algorithm works by repeating the clustering
with fixed centers, then efficiently finds the fuzzy membership at all
points.
References
----------
.. [1] Ross, Timothy J. Fuzzy Logic With Engineering Applications, 3rd ed.
Wiley. 2010. ISBN 978-0-470-74376-8 pp 352-353, eq 10.28 - 10.35.
"""
c = cntr_trained.shape[0]
# Setup u0
if init is None:
if seed is not None:
np.random.seed(seed=seed)
n = test_data.shape[1]
u0 = np.random.rand(c, n)
u0 = normalize_columns(u0)
init = u0.copy()
u0 = init
u = np.fmax(u0, np.finfo(np.float64).eps)
# Initialize loop parameters
jm = np.zeros(0)
p = 0
# Main cmeans loop
while p < maxiter - 1:
u2 = u.copy()
[u, Jjm, d] = _cmeans_predict0(test_data, cntr_trained, u2, c, m,
metric)
jm = np.hstack((jm, Jjm))
p += 1
# Stopping rule
if np.linalg.norm(u - u2) < error:
break
# Final calculations
error = np.linalg.norm(u - u2)
fpc = _fp_coeff(u)
return u, u0, d, jm, p, fpc
def _cmeans_predict0(test_data, cntr, u_old, c, m, metric):
"""
Single step in fuzzy c-means prediction algorithm. Clustering algorithm
modified from Ross, Fuzzy Logic w/Engineering Applications (2010)
p.352-353, equations 10.28 - 10.35, but this method to generate fuzzy
predictions was independently derived by Josh Warner.
Parameters inherited from cmeans()
Very similar to initial clustering, except `cntr` is not updated, thus
the new test data are forced into known (trained) clusters.
"""
# Normalizing, then eliminating any potential zero values.
u_old = normalize_columns(u_old)
u_old = np.fmax(u_old, np.finfo(np.float64).eps)
um = u_old ** m
test_data = test_data.T
# For prediction, we do not recalculate cluster centers. The test_data is
# forced to conform to the prior clustering.
d = _distance(test_data, cntr, metric)
d = np.fmax(d, np.finfo(np.float64).eps)
jm = (um * d ** 2).sum()
u = normalize_power_columns(d, - 2. / (m - 1))
return u, jm, d