# Source code for graph_tool.clustering

#! /usr/bin/env python
# -*- coding: utf-8 -*-
#
# graph_tool -- a general graph manipulation python module
#
# Copyright (C) 2006-2021 Tiago de Paula Peixoto <tiago@skewed.de>
#
# This program is free software; you can redistribute it and/or modify it under
# Software Foundation; either version 3 of the License, or (at your option) any
# later version.
#
# This program is distributed in the hope that it will be useful, but WITHOUT
# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
# FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more
# details.
#
# You should have received a copy of the GNU Lesser General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

"""
graph_tool.clustering - Clustering coefficients
---------------------------------------------------

Provides algorithms for calculation of clustering coefficients,
aka. transitivity.

Summary
+++++++

.. autosummary::
:nosignatures:

local_clustering
global_clustering
extended_clustering
motifs
motif_significance

Contents
++++++++
"""

from .. dl_import import dl_import
dl_import("from . import libgraph_tool_clustering as _gt")

from .. import _prop, Graph, GraphView, VertexPropertyMap, _get_rng
from .. topology import isomorphism
from .. generation import random_rewire
from .. stats import vertex_hist

from collections import defaultdict
from numpy import *

__all__ = ["local_clustering", "global_clustering", "extended_clustering",
"motifs", "motif_significance"]

[docs]def local_clustering(g, weight=None, prop=None, undirected=True):
r"""
Return the local clustering coefficients for all vertices.

Parameters
----------
g : :class:~graph_tool.Graph
Graph to be used.
weight : :class:~graph_tool.EdgePropertyMap, optional (default: None)
Edge weights. If omitted, a constant value of 1 will be used.
prop : :class:~graph_tool.VertexPropertyMap or string, optional
Vertex property map where results will be stored. If specified, this
parameter will also be the return value.
undirected : bool (default: True)
Calculate the *undirected* clustering coefficient, if graph is directed
(this option has no effect if the graph is undirected).

Returns
-------
prop : :class:~graph_tool.VertexPropertyMap
Vertex property containing the clustering coefficients.

--------
global_clustering: global clustering coefficient
extended_clustering: extended (generalized) clustering coefficient
motifs: motif counting

Notes
-----
The local clustering coefficient [watts-collective-1998]_ :math:c_i is
defined as

.. math::
c_i = \frac{|\{e_{jk}\}|}{k_i(k_i-1)} :\, v_j,v_k \in N_i,\, e_{jk} \in E

where :math:k_i is the out-degree of vertex :math:i, and

.. math::
N_i = \{v_j : e_{ij} \in E\}

is the set of out-neighbors of vertex :math:i. For undirected graphs the
value of :math:c_i is normalized as

.. math::
c'_i = 2c_i.

The implemented algorithm runs in :math:O(|V|\left<k^2\right>) time,
where :math:\left<k^2\right> is second moment of the degree distribution.

If enabled during compilation, this algorithm runs in parallel.

Examples
--------
>>> g = gt.collection.data["karate"]
>>> clust = gt.local_clustering(g)
>>> print(gt.vertex_average(g, clust))
(0.5706384782..., 0.05869813676...)

References
----------
.. [watts-collective-1998] D. J. Watts and Steven Strogatz, "Collective
dynamics of 'small-world' networks", Nature, vol. 393, pp 440-442, 1998.
:doi:10.1038/30918
"""

if prop is None:
prop = g.new_vertex_property("double")
if g.is_directed() and undirected:
g = GraphView(g, directed=False, skip_properties=True)
_gt.local_clustering(g._Graph__graph, _prop("v", g, prop),
_prop("e", g, weight))
return prop

[docs]def global_clustering(g, weight=None, ret_counts=False):
r"""Return the global clustering coefficient.

Parameters
----------
g : :class:~graph_tool.Graph
Graph to be used.
weight : :class:~graph_tool.EdgePropertyMap, optional (default: None)
Edge weights. If omitted, a constant value of 1 will be used.
ret_counts : boolean (optional, default: False)
If True the number of triangles and connected triples are also
returned.

Returns
-------
c : tuple of floats
Global clustering coefficient and standard deviation (jackknife method)
triangles : int (if ret_counts is True)
Number of triangles.
triples : int (if ret_counts is True)
Number of connected triples.

--------
local_clustering: local clustering coefficient
extended_clustering: extended (generalized) clustering coefficient
motifs: motif counting

Notes
-----
The global clustering coefficient [newman-structure-2003]_ :math:c is
defined as

.. math::
c = 3 \times \frac{\text{number of triangles}}
{\text{number of connected triples}}

If weights are given, the following definition is used:

.. math::
c = \frac{\mathrm{Tr}{{\boldsymbol A}^3}}{\sum_{i\ne j}[{\boldsymbol A}^2]_{ij}},

where :math:\boldsymbol A is the weighted adjacency matrix, and it is
assumed that the weights are normalized, i.e. :math:A_{ij} \le 1.

The implemented algorithm runs in time :math:O(|V|\left<k^2\right>),
where :math:\left< k^2\right> is the second moment of the degree
distribution.

If enabled during compilation, this algorithm runs in parallel.

Examples
--------
>>> g = gt.collection.data["karate"]
>>> print(gt.global_clustering(g))
(0.2556818181..., 0.06314746595...)

References
----------
.. [newman-structure-2003] M. E. J. Newman, "The structure and function of
complex networks", SIAM Review, vol. 45, pp. 167-256, 2003,
:doi:10.1137/S003614450342480

"""

if g.is_directed():
g = GraphView(g, directed=False, skip_properties=True)
c = _gt.global_clustering(g._Graph__graph, _prop("e", g, weight))
if ret_counts:
return c[:2], c, c
else:
return c[:2]

[docs]def extended_clustering(g, props=None, max_depth=3, undirected=False):
r"""
Return the extended clustering coefficients for all vertices.

Parameters
----------
g : :class:~graph_tool.Graph
Graph to be used.
props : list of :class:~graph_tool.VertexPropertyMap objects (optional, default: None)
list of vertex property maps where results will be stored. If specified,
this parameter will also be the return value.
max_depth : int (optional, default: 3)
Maximum clustering order.
undirected : boolean (optional, default: False)
Calculate the *undirected* clustering coefficients, if graph is directed
(this option has no effect if the graph is undirected).

Returns
-------
prop : list of :class:~graph_tool.VertexPropertyMap objects
List of vertex properties containing the clustering coefficients.

--------
local_clustering: local clustering coefficient
global_clustering: global clustering coefficient
motifs: motif counting

Notes
-----
The extended clustering coefficient :math:c^d_i of order :math:d is
defined as

.. math::
c^d_i = \frac{\left|\right\{ \{u,v\}; u,v \in N_i | d_{G(V\setminus
\{i\})}(u,v) = d \left\}\right|}{{\left|N_i\right| \choose 2}},

where :math:d_G(u,v) is the shortest distance from vertex :math:u to
:math:v in graph :math:G, and

.. math::
N_i = \{v_j : e_{ij} \in E\}

is the set of out-neighbors of :math:i. According to the above
definition, we have that the traditional local clustering coefficient is
recovered for :math:d=1, i.e., :math:c^1_i = c_i.

The implemented algorithm runs in
:math:O(|V|\left<k\right>^{2+\text{max-depth}}) worst time, where
:math:\left< k\right> is the average out-degree.

If enabled during compilation, this algorithm runs in parallel.

Examples
--------
>>> g = gt.collection.data["karate"]
>>> clusts = gt.extended_clustering(g, max_depth=5)
>>> for i in range(0, 5):
...    print(gt.vertex_average(g, clusts[i]))
...
(0.5706384782076..., 0.05869813676256...)
(0.3260389360735..., 0.04810773205917...)
(0.0530678759917..., 0.01513061504691...)
(0.0061658977316..., 0.00310690511463...)
(0.0002162629757..., 0.00021305890271...)

References
----------
.. [abdo-clustering] A. H. Abdo, A. P. S. de Moura, "Clustering as a
measure of the local topology of networks", :arxiv:physics/0605235
"""

if g.is_directed() and undirected:
g = GraphView(g, directed=False)
if props is None:
props = []
for i in range(0, max_depth):
props.append(g.new_vertex_property("double"))
_gt.extended_clustering(g._Graph__graph,
[_prop("v", g, p) for p in props])
return props

[docs]def motifs(g, k, p=1.0, motif_list=None, return_maps=False):
r"""
Count the occurrence of k-size node-induced subgraphs (motifs). A tuple with
two lists is returned: the list of motifs found, and the list with their
respective counts.

Parameters
----------
g : :class:~graph_tool.Graph
Graph to be used.
k : int
number of vertices of the motifs
p : float or float list (optional, default: 1.0)
uniform fraction of the motifs to be sampled. If a float list is
provided, it will be used as the fraction at each depth
:math:[1,\dots,k] in the algorithm. See [wernicke-efficient-2006]_ for
more details.
motif_list : list of :class:~graph_tool.Graph objects, optional
If supplied, the algorithms will only search for the motifs in this list
(or isomorphisms).
return_maps : bool (optional, default False)
If True, a list will be returned, which provides for each motif graph a
list of vertex property maps which map the motif to its location in the
main graph.

Returns
-------
motifs : list of :class:~graph_tool.Graph objects
List of motifs of size k found in the Graph. Graphs are grouped
according to their isomorphism class, and only one of each class appears
in this list. The list is sorted according to in-degree sequence,
out-degree-sequence, and number of edges (in this order).
counts : list of ints
The number of times the respective motif in the motifs list was counted
vertex_maps : list of lists of :class:~graph_tool.VertexPropertyMap objects
List for each motif graph containing the locations in the main
graph. This is only returned if return_maps == True.

--------
motif_significance: significance profile of motifs
local_clustering: local clustering coefficient
global_clustering: global clustering coefficient
extended_clustering: extended (generalized) clustering coefficient

Notes
-----
This functions implements the ESU and RAND-ESU algorithms described in
[wernicke-efficient-2006]_.

If enabled during compilation, this algorithm runs in parallel.

Examples
--------
.. testcode::
:hide:

np.random.seed(42)
gt.seed_rng(42)

>>> g = gt.random_graph(1000, lambda: (5,5))
>>> motifs, counts = gt.motifs(gt.GraphView(g, directed=False), 4)
>>> print(len(motifs))
11
>>> print(counts)
[116386, 392916, 443, 507, 2574, 1124, 741, 5, 5, 8, 2]

References
----------
.. [wernicke-efficient-2006] S. Wernicke, "Efficient detection of network
motifs", IEEE/ACM Transactions on Computational Biology and
Bioinformatics (TCBB), Volume 3, Issue 4, Pages 347-359, 2006.
:doi:10.1109/TCBB.2006.51
.. [induced-subgraph-isomorphism] http://en.wikipedia.org/wiki/Induced_subgraph_isomorphism_problem
"""

sub_list = []
directed_motifs = g.is_directed()

if motif_list is not None:
directed_motifs = motif_list.is_directed()
for m in motif_list:
if m.is_directed() != directed_motifs:
raise ValueError("all motif graphs must be either directed or undirected!")
if m.num_vertices() != k:
raise ValueError("all motifs must have the same number of vertices: %d" % k)
sub_list.append(m._Graph__graph)

if directed_motifs != g.is_directed():
raise ValueError("motifs do not have the same directionality as the graph itself!")

if type(p) == float:
pd = [1.0] * (k - 1)
pd.append(p)
if type(p) == list:
pd = [float(x) for x in p]

hist = []
vertex_maps = []
was_directed = g.is_directed()
_gt.get_motifs(g._Graph__graph, k, sub_list, hist, vertex_maps,
return_maps,  pd, True, len(sub_list) == 0,
_get_rng())

# assemble graphs
temp = []
for m in sub_list:
mg = Graph()
mg._Graph__graph = m
mg.reindex_edges()
temp.append(mg)
sub_list = temp

if return_maps:
list_hist = list(zip(sub_list, hist, vertex_maps))
else:
list_hist = list(zip(sub_list, hist))
# sort according to in-degree sequence
list_hist.sort(key=lambda x: sorted([v.in_degree() for v in x.vertices()]))

# sort according to out-degree sequence
list_hist.sort(key=lambda x: sorted([v.out_degree() for v in x.vertices()]))

# sort according to ascending number of edges
list_hist.sort(key=lambda x: x.num_edges())

sub_list = [x for x in list_hist]
hist = [x for x in list_hist]

if return_maps:
vertex_maps = [x for x in list_hist]
for i, vlist in enumerate(vertex_maps):
sub = sub_list[i]
vertex_maps[i] = [VertexPropertyMap(vm, sub) for vm in vlist]
return sub_list, hist, vertex_maps
return sub_list, hist

def _graph_sig(g):
"""return the graph signature, i.e., the in and out degree distribution as
concatenated as a tuple."""
bins = list(range(0, g.num_vertices() + 1))
in_dist = vertex_hist(g, "in", bins=bins if g.is_directed() else [0, 1],
float_count=False)
out_dist = vertex_hist(g, "out", bins=bins, float_count=False)
sig = tuple([(in_dist[i], in_dist[i]) for \
i in range(len(in_dist))] +
[(out_dist[i], out_dist[i]) for\
i in range(len(out_dist))])
return sig

[docs]def motif_significance(g, k, n_shuffles=100, p=1.0, motif_list=None,
threshold=0, self_loops=False, parallel_edges=False,
full_output=False, shuffle_model="configuration"):
r"""
Obtain the motif significance profile, for subgraphs with k vertices. A
tuple with two lists is returned: the list of motifs found, and their
respective z-scores.

Parameters
----------
g : :class:~graph_tool.Graph
Graph to be used.
k : int
Number of vertices of the motifs
n_shuffles : int (optional, default: 100)
Number of shuffled networks to consider for the z-score
p : float or float list (optional, default: 1.0)
Uniform fraction of the motifs to be sampled. If a float list is
provided, it will be used as the fraction at each depth
:math:[1,\dots,k] in the algorithm. See [wernicke-efficient-2006]_ for
more details.
motif_list : list of :class:~graph_tool.Graph objects (optional, default: None)
If supplied, the algorithms will only search for the motifs in this list
(isomorphisms)
threshold : int (optional, default: 0)
If a given motif count is below this level, it is not considered.
self_loops : bool (optional, default: False)
Whether or not the shuffled graphs are allowed to contain self-loops
parallel_edges : bool (optional, default: False)
Whether or not the shuffled graphs are allowed to contain parallel
edges.
full_output : bool (optional, default: False)
If set to True, three additional lists are returned: the count
of each motif, the average count of each motif in the shuffled networks,
and the standard deviation of the average count of each motif in the
shuffled networks.
shuffle_model : string (optional, default: "configuration")
Shuffle model to use. See :func:~graph_tool.generation.random_rewire
for details.

Returns
-------
motifs : list of :class:~graph_tool.Graph objects
List of motifs of size k found in the Graph. Graphs are grouped
according to their isomorphism class, and only one of each class appears
in this list. The list is sorted according to in-degree sequence,
out-degree-sequence, and number of edges (in this order).
z-scores : list of floats
The z-score of the respective motives. See below for the definition of
the z-score.

--------
motifs: motif counting or sampling
local_clustering: local clustering coefficient
global_clustering: global clustering coefficient
extended_clustering: extended (generalized) clustering coefficient

Notes
-----
The z-score :math:z_i of motif i is defined as

.. math::
z_i = \frac{N_i - \left<N^s_i\right>}
{\sqrt{\left<(N^s_i)^2\right> - \left<N^s_i\right>^2}},

where :math:N_i is the number of times motif i found, and :math:N^s_i
is the count of the same motif but on a shuffled network. It measures how
many standard deviations is each motif count, in respect to an ensemble of
randomly shuffled graphs with the same degree sequence.

The z-scores values are not normalized.

If enabled during compilation, this algorithm runs in parallel.

Examples
--------
>>> from numpy import random
>>> random.seed(10)
>>> g = gt.random_graph(100, lambda: (3,3))
>>> motifs, zscores = gt.motif_significance(g, 3)
>>> print(len(motifs))
12
>>> print(zscores)
[2.59252643351441, 2.5966529814390387, 2.3459237708258587, -1.0829180621127024, -1.3368754665984663, -2.33027728409781, -3.055817397993647, -0.1, -0.15, -0.19, -0.4, -0.01]

References
----------
.. [wernicke-efficient-2006] S. Wernicke, "Efficient detection of network
motifs", IEEE/ACM Transactions on Computational Biology and
Bioinformatics (TCBB), Volume 3, Issue 4, Pages 347-359, 2006.
:doi:10.1109/TCBB.2006.51
"""

s_ms, counts = motifs(g, k, p, motif_list)
if threshold > 0:
s_ms, counts = list(zip(*[x for x in zip(s_ms, counts) if x > threshold]))
s_ms = list(s_ms)
counts = list(counts)
s_counts =  * len(s_ms)
s_dev =  * len(s_ms)

# group subgraphs by number of edges
m_e = defaultdict(lambda: [])
for i in range(len(s_ms)):
m_e[_graph_sig(s_ms[i])].append(i)

# get samples
sg = g.copy()
for i in range(0, n_shuffles):
random_rewire(sg, model=shuffle_model, self_loops=self_loops,
parallel_edges=parallel_edges)
m_temp, count_temp = motifs(sg, k, p, motif_list)
if threshold > 0:
m_temp, count_temp = list(zip(*[x for x in zip(m_temp, count_temp) \
if x > threshold]))
for j in range(0, len(m_temp)):
found = False
for l in m_e[_graph_sig(m_temp[j])]:
if isomorphism(s_ms[l], m_temp[j]):
found = True
s_counts[l] += count_temp[j]
s_dev[l] += count_temp[j] ** 2
s_ms.append(m_temp[j])
s_counts.append(count_temp[j])
s_dev.append(count_temp[j] ** 2)
counts.append(0)
m_e[_graph_sig(m_temp[j])].append(len(s_ms) - 1)

s_counts = [x / float(n_shuffles) for x in s_counts]
s_dev = [max(sqrt(x / float(n_shuffles) - x ** 2), 1) \
for x in zip(s_dev, s_counts)]

list_hist = list(zip(s_ms, s_counts, s_dev))
# sort according to in-degree sequence
list_hist.sort(key = lambda x: sorted([v.in_degree() for v in x.vertices()])),

# sort according to out-degree sequence
list_hist.sort(key = lambda x: sorted([v.out_degree() for v in x.vertices()]))

# sort according to ascending number of edges
list_hist.sort(key = lambda x: x.num_edges())

s_ms, s_counts, s_dev = list(zip(*list_hist))

zscore = [(x - x) / x for x in zip(counts, s_counts, s_dev)]

if full_output:
return s_ms, zscore, counts, s_counts, s_dev
else:
return s_ms, zscore