Source code for graph_tool.clustering

#! /usr/bin/env python
# -*- coding: utf-8 -*-
#
# graph_tool -- a general graph manipulation python module
#
# Copyright (C) 2006-2025 Tiago de Paula Peixoto <tiago@skewed.de>
#
# This program is free software; you can redistribute it and/or modify it under
# the terms of the GNU Lesser General Public License as published by the Free
# Software Foundation; either version 3 of the License, or (at your option) any
# later version.
#
# This program is distributed in the hope that it will be useful, but WITHOUT
# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
# FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more
# details.
#
# You should have received a copy of the GNU Lesser General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

"""
``graph_tool.clustering``
=========================


This module provides algorithms for calculation of clustering coefficients,
a.k.a. transitivity, and related concepts.

Summary
+++++++

.. autosummary::
   :nosignatures:
   :toctree: autosummary

   local_clustering
   global_clustering
   extended_clustering
   motifs
   motif_significance

"""

from .. dl_import import dl_import
dl_import("from . import libgraph_tool_clustering as _gt")

from .. import _prop, Graph, GraphView, VertexPropertyMap, _get_rng, _parallel
from .. topology import isomorphism
from .. generation import random_rewire
from .. stats import vertex_hist

from collections import defaultdict

import numpy as np

__all__ = ["local_clustering", "global_clustering", "extended_clustering",
           "motifs", "motif_significance"]



[docs]
@_parallel
def local_clustering(g, weight=None, prop=None, undirected=True):
    r"""Return the local clustering coefficients for all vertices.

    Parameters
    ----------
    g : :class:`~graph_tool.Graph`
        Graph to be used.
    weight : :class:`~graph_tool.EdgePropertyMap`, optional (default: None)
        Edge weights. If omitted, a constant value of 1 will be used.
    prop : :class:`~graph_tool.VertexPropertyMap` or string, optional
        Vertex property map where results will be stored. If specified, this
        parameter will also be the return value.
    undirected : bool (default: ``True``)
        Calculate the *undirected* clustering coefficient, if graph is directed
        (this option has no effect if the graph is undirected).

    Returns
    -------
    prop : :class:`~graph_tool.VertexPropertyMap`
        Vertex property containing the clustering coefficients.

    See Also
    --------
    global_clustering: global clustering coefficient
    extended_clustering: extended (generalized) clustering coefficient
    motifs: motif counting

    Notes
    -----
    The local clustering coefficient [watts-collective-1998]_ :math:`c_i` is
    defined as

    .. math::
       c_i = \frac{|\{(j,k)\}|}{d_i(d_i-1)} :\, j,k \in N_i,\, (j,k) \in E

    where :math:`d_i` is the out-degree of vertex :math:`i`, and

    .. math::
       N_i = \{j : (i,j) \in E\}

    is the set of out-neighbors of vertex :math:`i`. For undirected graphs the
    value of :math:`c_i` further normalized as

    .. math::
       c_i \leftarrow  2c_i.

    If weights are given, the following definition is used
    [zhang-general-2005]_:

    .. math::
       c_i = \frac{\sum_{jk}A_{ij}A_{jk}A_{ik}}{\sum_{j\ne k}A_{ij}A_{ik}},

    where :math:`\boldsymbol A` is the weighted adjacency matrix, and it is
    assumed that the weights are normalized in the unit interval, i.e.
    :math:`A_{ij} \in [0,1]`.

    The implemented algorithm runs in :math:`O(|V|\left<k^2\right>)` time,
    where :math:`\left<k^2\right>` is second moment of the degree distribution.

    @parallel@

    Examples
    --------
    >>> g = gt.collection.data["karate"]
    >>> clust = gt.local_clustering(g)
    >>> print(gt.vertex_average(g, clust))
    (0.5706384782..., 0.05869813676...)

    References
    ----------
    .. [watts-collective-1998] D. J. Watts and Steven Strogatz, "Collective
       dynamics of 'small-world' networks", Nature, vol. 393, pp 440-442, 1998.
       :doi:`10.1038/30918`
    .. [zhang-general-2005] Zhang B, Horvath S, "A general framework for
       weighted gene co-expression network analysis", Stat Appl Genet Mol Biol. 4
       17 (2005) :doi:`10.2202/1544-6115.1128`

    """

    if prop is None:
        prop = g.new_vertex_property("double")
    if g.is_directed() and undirected:
        g = GraphView(g, directed=False, skip_properties=True)
    _gt.local_clustering(g._Graph__graph, _prop("v", g, prop),
                         _prop("e", g, weight))
    return prop




[docs]
@_parallel
def global_clustering(g, weight=None, ret_counts=False, sampled=False, m=1000):
    r"""Return the global clustering coefficient.

    Parameters
    ----------
    g : :class:`~graph_tool.Graph`
        Graph to be used.
    weight : :class:`~graph_tool.EdgePropertyMap`, optional (default: None)
        Edge weights. If omitted, a constant value of 1 will be used.
    ret_counts : ``boolean`` (optional, default: ``False``)
        If ``True`` the number of triangles and connected triples are also
        returned.
    sampled : bool (default: ``False``)
        If ``True`` a much faster, assymptotically exact, sampling estimate is
        performed. In this case the, ``weight`` option is ignored.
    m : int (default: ``1000``)
        If ``sampled is True``, this will be the number of samples used for the
        estimation.

    Returns
    -------
    c : tuple of floats or float
        Global clustering coefficient and standard deviation (jackknife method).
        If ``sampled is True`` only the clustering coefficient is returned.
    triangles : `int` (if ``ret_counts is True``)
        Number of triangles. Not returned if ``sampled is True``.
    triples : `int` (if ``ret_counts is True``)
        Number of connected triples. Not returned if ``sampled is True``.

    See Also
    --------
    local_clustering: local clustering coefficient
    extended_clustering: extended (generalized) clustering coefficient
    motifs: motif counting

    Notes
    -----
    The global clustering coefficient [newman-structure-2003]_ :math:`c` is
    defined as

    .. math::
       c = 3 \times \frac{\text{number of triangles}}
                          {\text{number of connected triples}}

    If weights are given, the following definition is used
    [zhang-general-2005]_:

    .. math::
       c = \frac{\mathrm{Tr}{{\boldsymbol A}^3}}{\sum_{i\ne j}[{\boldsymbol A}^2]_{ij}},

    where :math:`\boldsymbol A` is the weighted adjacency matrix, and it is
    assumed that the weights are normalized in the unit interval, i.e.
    :math:`A_{ij} \in [0,1]`.

    The implemented algorithm runs in time :math:`O(|V|\left<k^2\right>)`,
    where :math:`\left< k^2\right>` is the second moment of the degree
    distribution.

    @parallel@

    Examples
    --------
    >>> g = gt.collection.data["karate"]
    >>> print(gt.global_clustering(g))
    (0.2556818181..., 0.06314746595...)

    References
    ----------
    .. [newman-structure-2003] M. E. J. Newman, "The structure and function of
       complex networks", SIAM Review, vol. 45, pp. 167-256, 2003,
       :doi:`10.1137/S003614450342480`
    .. [zhang-general-2005] Zhang B, Horvath S, "A general framework for
       weighted gene co-expression network analysis", Stat Appl Genet Mol Biol. 4
       17 (2005) :doi:`10.2202/1544-6115.1128`
    """

    if g.is_directed():
        g = GraphView(g, directed=False, skip_properties=True)
    if not sampled:
        c = _gt.global_clustering(g._Graph__graph, _prop("e", g, weight))
    else:
        return _gt.global_clustering_sampled(g._Graph__graph, m, _get_rng())
    if ret_counts:
        return c[:2], c[2], c[3]
    else:
        return c[:2]




[docs]
@_parallel
def extended_clustering(g, props=None, max_depth=3, undirected=False):
    r"""
    Return the extended clustering coefficients for all vertices.

    Parameters
    ----------
    g : :class:`~graph_tool.Graph`
        Graph to be used.
    props : list of :class:`~graph_tool.VertexPropertyMap` objects (optional, default: ``None``)
        list of vertex property maps where results will be stored. If specified,
        this parameter will also be the return value.
    max_depth : int (optional, default: ``3``)
        Maximum clustering order.
    undirected : boolean (optional, default: ``False``)
        Calculate the *undirected* clustering coefficients, if graph is directed
        (this option has no effect if the graph is undirected).

    Returns
    -------
    prop : list of :class:`~graph_tool.VertexPropertyMap` objects
        List of vertex properties containing the clustering coefficients.

    See Also
    --------
    local_clustering: local clustering coefficient
    global_clustering: global clustering coefficient
    motifs: motif counting

    Notes
    -----
    The extended clustering coefficient :math:`c^d_i` of order :math:`d` is
    defined as

    .. math::
       c^d_i = \frac{\left|\right\{ \{u,v\}; u,v \in N_i | d_{G(V\setminus
       \{i\})}(u,v) = d \left\}\right|}{{\left|N_i\right| \choose 2}},

    where :math:`d_G(u,v)` is the shortest distance from vertex :math:`u` to
    :math:`v` in graph :math:`G`, and

    .. math::
       N_i = \{v_j : e_{ij} \in E\}

    is the set of out-neighbors of :math:`i`. According to the above
    definition, we have that the traditional local clustering coefficient is
    recovered for :math:`d=1`, i.e., :math:`c^1_i = c_i`.

    The implemented algorithm runs in
    :math:`O(|V|\left<k\right>^{2+\text{max-depth}})` worst time, where
    :math:`\left< k\right>` is the average out-degree.

    @parallel@

    Examples
    --------
    >>> g = gt.collection.data["karate"]
    >>> clusts = gt.extended_clustering(g, max_depth=5)
    >>> for i in range(0, 5):
    ...    print(gt.vertex_average(g, clusts[i]))
    ...
    (0.5706384782076..., 0.05869813676256...)
    (0.3260389360735..., 0.04810773205917...)
    (0.0530678759917..., 0.01513061504691...)
    (0.0061658977316..., 0.00310690511463...)
    (0.0002162629757..., 0.00021305890271...)

    References
    ----------
    .. [abdo-clustering] A. H. Abdo, A. P. S. de Moura, "Clustering as a
       measure of the local topology of networks", :arxiv:`physics/0605235`
    """

    if g.is_directed() and undirected:
        g = GraphView(g, directed=False)
    if props is None:
        props = []
        for i in range(0, max_depth):
            props.append(g.new_vertex_property("double"))
    _gt.extended_clustering(g._Graph__graph,
                            [_prop("v", g, p) for p in props])
    return props




[docs]
@_parallel
def motifs(g, k, p=1.0, motif_list=None, return_maps=False):
    r"""
    Count the occurrence of k-size node-induced subgraphs (motifs). A tuple with
    two lists is returned: the list of motifs found, and the list with their
    respective counts.

    Parameters
    ----------
    g : :class:`~graph_tool.Graph`
        Graph to be used.
    k : int
        number of vertices of the motifs
    p : float or float list (optional, default: `1.0`)
        uniform fraction of the motifs to be sampled. If a float list is
        provided, it will be used as the fraction at each depth
        :math:`[1,\dots,k]` in the algorithm. See [wernicke-efficient-2006]_ for
        more details.
    motif_list : list of :class:`~graph_tool.Graph` objects, optional
        If supplied, the algorithms will only search for the motifs in this list
        (or isomorphisms).
    return_maps : bool (optional, default `False`)
        If ``True``, a list will be returned, which provides for each motif graph a
        list of vertex property maps which map the motif to its location in the
        main graph.

    Returns
    -------
    motifs : list of :class:`~graph_tool.Graph` objects
        List of motifs of size k found in the Graph. Graphs are grouped
        according to their isomorphism class, and only one of each class appears
        in this list. The list is sorted according to in-degree sequence,
        out-degree-sequence, and number of edges (in this order).
    counts : list of ints
        The number of times the respective motif in the motifs list was counted
    vertex_maps : list of lists of :class:`~graph_tool.VertexPropertyMap` objects
        List for each motif graph containing the locations in the main
        graph. This is only returned if `return_maps == True`.

    See Also
    --------
    motif_significance: significance profile of motifs
    local_clustering: local clustering coefficient
    global_clustering: global clustering coefficient
    extended_clustering: extended (generalized) clustering coefficient

    Notes
    -----
    This functions implements the ESU and RAND-ESU algorithms described in
    [wernicke-efficient-2006]_.

    @parallel@

    Examples
    --------
    .. testcode::
       :hide:

       np.random.seed(42)
       gt.seed_rng(42)

    >>> g = gt.random_graph(1000, lambda: (5,5))
    >>> motifs, counts = gt.motifs(gt.GraphView(g, directed=False), 4)
    >>> print(len(motifs))
    11
    >>> print(counts)
    [115626, 390445, 662, 763, 2816, 1686, 776, 10, 5, 11, 3]

    References
    ----------
    .. [wernicke-efficient-2006] S. Wernicke, "Efficient detection of network
       motifs", IEEE/ACM Transactions on Computational Biology and
       Bioinformatics (TCBB), Volume 3, Issue 4, Pages 347-359, 2006.
       :doi:`10.1109/TCBB.2006.51`
    .. [induced-subgraph-isomorphism] http://en.wikipedia.org/wiki/Induced_subgraph_isomorphism_problem
    """

    sub_list = []
    directed_motifs = g.is_directed()

    if motif_list is not None:
        directed_motifs = motif_list[0].is_directed()
        for m in motif_list:
            if m.is_directed() != directed_motifs:
                raise ValueError("all motif graphs must be either directed or undirected!")
            if m.num_vertices() != k:
                raise ValueError("all motifs must have the same number of vertices: %d" % k)
            sub_list.append(m._Graph__graph)

    if directed_motifs != g.is_directed():
        raise ValueError("motifs do not have the same directionality as the graph itself!")

    if isinstance(p, float):
        pd = [1.0] * (k - 1)
        pd.append(p)
    elif isinstance(p, list):
        pd = [float(x) for x in p]

    hist = []
    vertex_maps = []
    _gt.get_motifs(g._Graph__graph, k, sub_list, hist, vertex_maps, return_maps,
                   pd, True, len(sub_list) == 0, _get_rng())

    # assemble graphs
    temp = []
    for m in sub_list:
        mg = Graph()
        mg._Graph__graph = m
        mg.reindex_edges()
        temp.append(mg)
    sub_list = temp

    if return_maps:
        list_hist = list(zip(sub_list, hist, vertex_maps))
    else:
        list_hist = list(zip(sub_list, hist))
    # sort according to in-degree sequence
    list_hist.sort(key=lambda x: sorted([v.in_degree() for v in x[0].vertices()]))

    # sort according to out-degree sequence
    list_hist.sort(key=lambda x: sorted([v.out_degree() for v in x[0].vertices()]))

    # sort according to ascending number of edges
    list_hist.sort(key=lambda x: x[0].num_edges())

    sub_list = [x[0] for x in list_hist]
    hist = [x[1] for x in list_hist]

    if return_maps:
        vertex_maps = [x[2] for x in list_hist]
        for i, vlist in enumerate(vertex_maps):
            sub = sub_list[i]
            vertex_maps[i] = [VertexPropertyMap(vm, sub) for vm in vlist]
        return sub_list, hist, vertex_maps
    return sub_list, hist



def _graph_sig(g):
    """return the graph signature, i.e., the in and out degree distribution as
    concatenated as a tuple."""
    bins = list(range(0, g.num_vertices() + 1))
    in_dist = vertex_hist(g, "in", bins=bins if g.is_directed() else [0, 1],
                          float_count=False)
    out_dist = vertex_hist(g, "out", bins=bins, float_count=False)
    sig = tuple([(in_dist[1][i], in_dist[0][i]) for \
                 i in range(len(in_dist[0]))] +
                [(out_dist[1][i], out_dist[0][i]) for\
                 i in range(len(out_dist[0]))])
    return sig



[docs]
@_parallel
def motif_significance(g, k, n_shuffles=100, p=1.0, motif_list=None,
                       threshold=0, self_loops=False, parallel_edges=False,
                       full_output=False, shuffle_model="configuration"):
    r"""
    Obtain the motif significance profile, for subgraphs with k vertices. A
    tuple with two lists is returned: the list of motifs found, and their
    respective z-scores.

    Parameters
    ----------
    g : :class:`~graph_tool.Graph`
        Graph to be used.
    k : int
        Number of vertices of the motifs
    n_shuffles : int (optional, default: 100)
        Number of shuffled networks to consider for the z-score
    p : float or float list (optional, default: 1.0)
        Uniform fraction of the motifs to be sampled. If a float list is
        provided, it will be used as the fraction at each depth
        :math:`[1,\dots,k]` in the algorithm. See [wernicke-efficient-2006]_ for
        more details.
    motif_list : list of :class:`~graph_tool.Graph` objects (optional, default: None)
        If supplied, the algorithms will only search for the motifs in this list
        (isomorphisms)
    threshold : int (optional, default: 0)
        If a given motif count is below this level, it is not considered.
    self_loops : bool (optional, default: False)
        Whether or not the shuffled graphs are allowed to contain self-loops
    parallel_edges : bool (optional, default: False)
        Whether or not the shuffled graphs are allowed to contain parallel
        edges.
    full_output : bool (optional, default: False)
        If set to True, three additional lists are returned: the count
        of each motif, the average count of each motif in the shuffled networks,
        and the standard deviation of the average count of each motif in the
        shuffled networks.
    shuffle_model : string (optional, default: "configuration")
        Shuffle model to use. See :func:`~graph_tool.generation.random_rewire`
        for details.

    Returns
    -------
    motifs : list of :class:`~graph_tool.Graph` objects
        List of motifs of size k found in the Graph. Graphs are grouped
        according to their isomorphism class, and only one of each class appears
        in this list. The list is sorted according to in-degree sequence,
        out-degree-sequence, and number of edges (in this order).
    z-scores : list of floats
        The z-score of the respective motives. See below for the definition of
        the z-score.

    See Also
    --------
    motifs: motif counting or sampling
    local_clustering: local clustering coefficient
    global_clustering: global clustering coefficient
    extended_clustering: extended (generalized) clustering coefficient

    Notes
    -----
    The z-score :math:`z_i` of motif i is defined as

    .. math::
         z_i = \frac{N_i - \left<N^s_i\right>}
         {\sqrt{\left<(N^s_i)^2\right> - \left<N^s_i\right>^2}},

    where :math:`N_i` is the number of times motif i found, and :math:`N^s_i`
    is the count of the same motif but on a shuffled network. It measures how
    many standard deviations is each motif count, in respect to an ensemble of
    randomly shuffled graphs with the same degree sequence.

    The z-score values are not normalized.

    @parallel@

    Examples
    --------
    >>> from numpy import random
    >>> random.seed(10)
    >>> g = gt.random_graph(100, lambda: (3,3))
    >>> motifs, zscores = gt.motif_significance(g, 3)
    >>> print(len(motifs))
    11
    >>> print(zscores)
    [0.6047878414563249, 0.6094680990819686, 1.0229305974010572, 0.39872798688563355, -0.6865659282127776, -0.7003194037662325, -1.1702999575943225, -0.15, -0.15, -0.29, -0.15]

    References
    ----------
    .. [wernicke-efficient-2006] S. Wernicke, "Efficient detection of network
       motifs", IEEE/ACM Transactions on Computational Biology and
       Bioinformatics (TCBB), Volume 3, Issue 4, Pages 347-359, 2006.
       :doi:`10.1109/TCBB.2006.51`
    """

    s_ms, counts = motifs(g, k, p, motif_list)
    if threshold > 0:
        s_ms, counts = list(zip(*[x for x in zip(s_ms, counts) if x[1] > threshold]))
        s_ms = list(s_ms)
        counts = list(counts)
    s_counts = [0] * len(s_ms)
    s_dev = [0] * len(s_ms)

    # group subgraphs by number of edges
    m_e = defaultdict(lambda: [])
    for i in range(len(s_ms)):
        m_e[_graph_sig(s_ms[i])].append(i)

    # get samples
    sg = g.copy()
    for i in range(0, n_shuffles):
        random_rewire(sg, model=shuffle_model, self_loops=self_loops,
                      parallel_edges=parallel_edges)
        m_temp, count_temp = motifs(sg, k, p, motif_list)
        if threshold > 0:
            m_temp, count_temp = list(zip(*[x for x in zip(m_temp, count_temp) \
                                       if x[1] > threshold]))
        for j in range(0, len(m_temp)):
            found = False
            for l in m_e[_graph_sig(m_temp[j])]:
                if isomorphism(s_ms[l], m_temp[j]):
                    found = True
                    s_counts[l] += count_temp[j]
                    s_dev[l] += count_temp[j] ** 2
            if not found:
                s_ms.append(m_temp[j])
                s_counts.append(count_temp[j])
                s_dev.append(count_temp[j] ** 2)
                counts.append(0)
                m_e[_graph_sig(m_temp[j])].append(len(s_ms) - 1)

    s_counts = [x / float(n_shuffles) for x in s_counts]
    s_dev = [max((np.sqrt(x[0] / float(n_shuffles) - x[1] ** 2), 1)) \
              for x in zip(s_dev, s_counts)]

    list_hist = list(zip(s_ms, s_counts, s_dev))
    # sort according to in-degree sequence
    list_hist.sort(key = lambda x: sorted([v.in_degree() for v in x[0].vertices()])),

    # sort according to out-degree sequence
    list_hist.sort(key = lambda x: sorted([v.out_degree() for v in x[0].vertices()]))

    # sort according to ascending number of edges
    list_hist.sort(key = lambda x: x[0].num_edges())

    s_ms, s_counts, s_dev = list(zip(*list_hist))

    zscore = [float((x[0] - x[1]) / x[2]) for x in zip(counts, s_counts, s_dev)]

    if full_output:
        return s_ms, zscore, counts, s_counts, s_dev
    else:
        return s_ms, zscore