Source code for prestools.clustering

#!/usr/bin/env python
# -*- coding: UTF-8 -*-
# Created by Roberto Preste
import pandas as pd
import numpy as np
import scipy.cluster.hierarchy as sch
import scipy.spatial.distance as ssd
import matplotlib.pyplot as plt
from typing import Union
from .classes import HierCluster


def hierarchical_clustering(df: Union[pd.DataFrame, np.ndarray],
                            method: str = "ward") -> Union[HierCluster,
                                                           None, ValueError]:
    """Hierarchical cluster of a dataframe.

    Return clustering created using scipy from a given dataframe of
    correlations, using the HierCluster class available in
    prestools.classes. The rows of the input are passed to scipy as the
    observations to cluster.

    See Also:
        https://docs.scipy.org/doc/scipy/reference/generated/scipy.cluster.hierarchy.linkage.html

    Args:
        df: input dataframe of correlations
        method: method to use to cluster the data ('ward', 'single',
            'complete', 'average', 'weighted', 'centroid', 'median')
            (default: 'ward')

    Returns:
        cl: instance of prestools.classes.HierCluster() with its
            `linkage`, `pair_dist`, `coph_dist` and `coph_matr`
            attributes set from scipy's linkage(), pdist() and
            cophenet() results.
        None if the input has fewer than two rows (nothing to cluster).
        A ValueError instance (returned, NOT raised) if `method` is not
        one of the supported names.
    """
    valid_methods = ("ward", "single", "complete", "average",
                     "weighted", "centroid", "median")
    if method not in valid_methods:
        # The exception object is returned rather than raised: this is
        # part of the declared return type, so callers may rely on it.
        return ValueError("Method not valid!")
    # At least two observations are needed to form a single merge; this
    # also covers the (0, 0) and (1, 1) shapes as well as (1, n)/(0, n).
    if df.shape[0] < 2:
        return None

    # Compute the condensed (Euclidean) distance matrix once and reuse it
    # for the linkage: linkage() on raw observations would recompute the
    # very same distances internally.
    pair_dist = ssd.pdist(df)

    cl = HierCluster()
    cl.linkage = sch.linkage(pair_dist, method=method)
    cl.pair_dist = pair_dist
    cl.coph_dist, cl.coph_matr = sch.cophenet(cl.linkage, cl.pair_dist)

    return cl


def find_n_clusters_elbow(df: Union[pd.DataFrame, np.ndarray],
                          plot: bool = False,
                          method: str = "ward") -> Union[int,
                                                         None, ValueError]:
    """Find the suggested number of clusters using the elbow method.

    Find the suggested number of clusters for the given dataframe of
    correlations, using the elbow method: the data are clustered
    hierarchically and the number of clusters is chosen where the
    second difference ("acceleration") of the merge distances is largest.

    Args:
        df: input dataframe of correlations
        plot: plot the resulting elbow plot (default: False)
        method: method to use to cluster the data ('ward', 'single',
            'complete', 'average', 'weighted', 'centroid', 'median')
            (default: 'ward')

    Returns:
        n_clusters: number of clusters found, as a plain int.
        None if the input has fewer than two rows, or if there are too
        few merges to compute a second difference.
        A ValueError instance (returned, NOT raised) if `method` is not
        one of the supported names.
    """
    valid_methods = ("ward", "single", "complete", "average",
                     "weighted", "centroid", "median")
    if method not in valid_methods:
        # Returned rather than raised, as declared in the return type.
        return ValueError("Method not valid!")
    # Fewer than two observations cannot be clustered at all; this covers
    # (0, 0) and (1, 1) as well as shapes such as (1, n) or (0, n).
    if df.shape[0] < 2:
        return None

    cl = hierarchical_clustering(df, method=method)
    # Propagate a "no result" / invalid-method answer from the clustering
    # step instead of failing on a missing `.linkage` attribute.
    if cl is None or isinstance(cl, ValueError):
        return cl

    # Third linkage column: distance at which each merge happened.
    Z = cl.linkage[:, 2]
    acceleration = np.diff(Z, 2)
    acceleration_rev = acceleration[::-1]
    if len(acceleration_rev) == 0:
        return None
    # Reversed index i corresponds to i + 2 clusters; convert the NumPy
    # integer to a built-in int to honour the annotated return type.
    n_clusters = int(acceleration_rev.argmax()) + 2

    if plot:
        Z_rev = Z[::-1]
        idxs = np.arange(1, len(Z) + 1)
        plt.plot(idxs, Z_rev)
        plt.plot(idxs[:-2] + 1, acceleration_rev)
        plt.axvline(n_clusters, color="red", linestyle="--")
        plt.xlabel("num of clusters", fontsize=14)
        plt.ylabel("acceleration of distance growth", fontsize=14)
        plt.title("Suggested number of clusters: {}".format(n_clusters),
                  fontsize=22)
        plt.show()

    return n_clusters