# Source code for clep.embedding.network_generator

# -*- coding: utf-8 -*-

"""Ensemble of methods for network generation."""
from itertools import combinations
from os import listdir
from os.path import isfile, join
from typing import TextIO, Optional, Tuple, Union, Set
import logging

import networkx as nx
import pandas as pd
from tqdm import tqdm

from clep.constants import VALUE_TO_COLNAME

# Module-level logger, named after this module's import path.
logger = logging.getLogger(__name__)


def do_graph_gen(
        data: pd.DataFrame,
        network_gen_method: Optional[str] = 'interaction_network',
        gmt: Optional[str] = None,
        intersection_threshold: Optional[float] = 0.1,
        kg_data: Optional[pd.DataFrame] = None,
        folder_path: Optional[str] = None,
        jaccard_threshold: Optional[float] = 0.2,
        summary: bool = False,
) -> Union[pd.DataFrame, Tuple[pd.DataFrame, pd.DataFrame, Set]]:
    """Generate patient-feature network given the data using a certain network generation method.

    :param data: Dataframe containing the patient-feature scores
    :param network_gen_method: Method to generate the patient-feature network; one of ``'pathway_overlap'``,
        ``'interaction_network'`` or ``'interaction_network_overlap'``
    :param gmt: Path to the gmt file containing the pathway data (required for ``'pathway_overlap'``)
    :param intersection_threshold: Threshold to make edges in Pathway Overlap method
    :param kg_data: Knowledge graph in edgelist format stored in a pandas dataframe, read positionally as
        source, relation, target (required for ``'interaction_network'``)
    :param folder_path: Path to a folder containing multiple knowledge graphs (required for
        ``'interaction_network_overlap'``)
    :param jaccard_threshold: Threshold to make edges in Interaction Network Overlap method
    :param summary: Flag to indicate if the summary of the patient-feature network must be returned
    :return: Dataframe containing patient-feature network, and optionally the summary of the patient-feature network
        and the set of linked genes
    :raises ValueError: If the method is unknown or the input required by the chosen method is missing
    """
    if network_gen_method == 'pathway_overlap':
        if gmt is None:
            raise ValueError("'gmt' must be provided when network_gen_method is 'pathway_overlap'.")

        with open(gmt, 'r') as geneset:
            information_graph = plot_pathway_overlap(geneset, intersection_threshold)

    elif network_gen_method == 'interaction_network':
        if kg_data is None:
            raise ValueError("'kg_data' must be provided when network_gen_method is 'interaction_network'.")

        # Undirected view of the knowledge graph, used only to report its connectivity
        interaction_graph = nx.from_pandas_edgelist(
            df=kg_data,
            source=kg_data.columns[0],
            target=kg_data.columns[2],
            edge_attr=kg_data.columns[1]
        )

        # Compute the components once and reuse them for both the check and the message
        component_sizes = sorted(
            (len(component) for component in nx.connected_components(interaction_graph)),
            reverse=True,
        )
        if len(component_sizes) > 1:
            logger.warning(f'The number of connected components in the graph is greater than 1. '
                           f'There are {len(component_sizes)} connected components of size'
                           f', {component_sizes}'
                           f' respectively.')

        information_graph = plot_interaction_network(kg_data)

    elif network_gen_method == 'interaction_network_overlap':
        if folder_path is None:
            raise ValueError(
                "'folder_path' must be provided when network_gen_method is 'interaction_network_overlap'."
            )

        information_graph = plot_interaction_net_overlap(folder_path, jaccard_threshold)

    else:
        # Previously an unknown method produced an empty graph and failed later with an opaque KeyError
        raise ValueError(f'Unknown network generation method: {network_gen_method!r}')

    summary_data = linked_genes = None
    if summary:
        final_graph, summary_data, linked_genes = overlay_samples(data, information_graph, summary=True)
    else:
        final_graph = overlay_samples(data, information_graph, summary=False)

    graph_df = nx.to_pandas_edgelist(final_graph)

    # Edge attributes only become columns if at least one edge carries them; reindex guarantees the
    # expected columns exist (filled with NaN) even when no patient could be linked to the graph.
    graph_df = graph_df.reindex(columns=['source', 'target', 'relation', 'label'])

    # Assign instead of a chained in-place fillna, which does not write back under pandas Copy-on-Write
    graph_df['relation'] = graph_df['relation'].fillna('no_change')

    if summary:
        return graph_df, summary_data, linked_genes

    return graph_df
def plot_pathway_overlap(
        geneset: TextIO,
        intersection_threshold: float = 0.1
) -> nx.DiGraph:
    """Plot the overlap/intersection between pathways as a graph based on shared genes.

    Each line of the gmt file is read as tab separated fields: the first field is the pathway name, the
    second is skipped and the remaining fields are the genes of the pathway.

    :param geneset: Open handle to the gmt file
    :param intersection_threshold: Two pathways are linked when the number of shared genes is strictly
        greater than this fraction of the size of the union of their genes
    :return: Directed graph with an edge in both directions between every pair of overlapping pathways
    """
    pathway_genes = {}
    for line in geneset:
        fields = line.strip().split("\t")
        # Blank lines carry no pathway; they could never produce an edge anyway
        if not fields[0]:
            continue
        # Store the genes as a set once so that every pairwise comparison is a cheap set operation
        pathway_genes[fields[0]] = set(fields[2:])

    pathway_overlap_graph = nx.DiGraph()

    for pathway_1 in tqdm(pathway_genes, desc='Finding pathway overlap: '):
        genes_1 = pathway_genes[pathway_1]

        for pathway_2, genes_2 in pathway_genes.items():
            if pathway_1 == pathway_2:
                continue

            # NOTE: set().intersection(a, b) intersects with the empty set and is always empty,
            # so the intersection has to be taken between the two gene sets themselves.
            shared = len(genes_1 & genes_2)
            total = len(genes_1) + len(genes_2) - shared

            if shared > (intersection_threshold * total):
                pathway_overlap_graph.add_edge(str(pathway_1), str(pathway_2))

    return pathway_overlap_graph


def plot_interaction_network(
        kg_data: pd.DataFrame
) -> nx.DiGraph:
    """Plot a knowledge graph based on the interaction data.

    :param kg_data: Edgelist whose columns are read positionally as source, relation, target
    :return: Directed graph with one edge per row, carrying the relation as the ``relation`` attribute
    """
    interaction_graph = nx.DiGraph()

    # ``.iat`` is purely positional, so iterate over row positions rather than index labels;
    # index labels only coincide with positions for a default RangeIndex.
    for row in tqdm(range(len(kg_data)), desc='Plotting interaction network: '):
        interaction_graph.add_edge(
            str(kg_data.iat[row, 0]),
            str(kg_data.iat[row, 2]),
            relation=str(kg_data.iat[row, 1])
        )

    return interaction_graph


def plot_interaction_net_overlap(
        folder_path: str,
        jaccard_threshold: float = 0.2
) -> nx.DiGraph:
    """Plot the overlap/intersection between interaction networks as a graph based on shared nodes.

    :param folder_path: Folder holding the ``.bel`` files; every non-blank line of a file must consist of
        exactly three whitespace separated tokens: source, attribute, target
    :param jaccard_threshold: Two networks are linked when their jaccard index is strictly greater than this
    :return: Directed graph whose nodes are file names and whose edges link overlapping networks
    :raises ValueError: If a non-blank line does not split into exactly three tokens
    """
    # Sorted so that the direction of the generated edges does not depend on the directory listing order
    files = sorted(
        f for f in listdir(folder_path)
        if isfile(join(folder_path, f)) and f.endswith('.bel')
    )

    # Get all the interaction network files from the folder and add them as individual graphs to a list
    graphs = []
    for filename in tqdm(files, desc='Plotting interaction network: '):
        graph = nx.DiGraph(name=filename)

        with open(join(folder_path, filename), 'r') as file:
            for line in file:
                fields = line.split()
                # Tolerate blank lines (e.g. a trailing newline at the end of the file)
                if not fields:
                    continue
                src, attr, dst = fields
                graph.add_edge(src, dst, attribute=attr)

        graphs.append(graph)

    overlap_graph = nx.DiGraph()

    number_of_pairs = len(graphs) * (len(graphs) - 1) // 2
    for graph_1, graph_2 in tqdm(
            combinations(graphs, 2),
            total=number_of_pairs,
            desc='Finding interaction network overlap: ',
    ):
        if _get_jaccard_index(graph_1, graph_2) > jaccard_threshold:
            overlap_graph.add_edge(str(graph_1.graph['name']), str(graph_2.graph['name']))

    return overlap_graph


def _get_jaccard_index(
        graph_1: nx.DiGraph,
        graph_2: nx.DiGraph
) -> float:
    """Calculate the jaccard index between 2 graphs based on pairwise (edges) jaccard index.

    For every node present in both graphs, the jaccard index of its two neighbor sets is computed and the
    values are averaged. Shared nodes without neighbors in either graph have an undefined jaccard index
    and are left out of the average.

    :param graph_1: First graph
    :param graph_2: Second graph
    :return: Average neighborhood jaccard index, or 0.0 when no shared node has any neighbors
    """
    total = 0.0
    compared_nodes = 0

    for node in graph_1:
        if node not in graph_2:
            continue

        neighbors_1 = set(graph_1[node])
        neighbors_2 = set(graph_2[node])

        length_intersection = len(neighbors_1 & neighbors_2)
        length_union = len(neighbors_1) + len(neighbors_2) - length_intersection

        # Shared node without neighbors in both graphs: nothing to compare, avoid dividing by zero
        if length_union == 0:
            continue

        total += length_intersection / length_union
        compared_nodes += 1

    # No comparable node at all means no measurable overlap
    if compared_nodes == 0:
        return 0.0

    # To calculate the average
    return total / compared_nodes


def overlay_samples(
        data: pd.DataFrame,
        information_graph: nx.DiGraph,
        summary: bool = False,
) -> Union[nx.DiGraph, Tuple[nx.DiGraph, pd.DataFrame, Set]]:
    """Overlay the data onto the information graph by adding edges between patients and information nodes.

    :param data: Dataframe indexed by patient, with one column per feature holding 0, 1 or -1 and a
        ``label`` column holding the label of the patient
    :param information_graph: Graph whose nodes are matched against the feature names
    :param summary: Flag to indicate if the summary and the linked genes must be returned as well
    :return: Copy of the information graph with the patient edges added, optionally followed by the
        per-patient counts of positive/negative relations and the set of genes linked to a patient
    """
    patient_label_mapping = dict(zip(data.index, data['label']))
    value_mapping = {0: 'no_change', 1: 'up_reg', -1: 'down_reg'}

    overlay_graph = information_graph.copy()

    data_copy = data.drop(columns='label')
    values_data = data_copy.values

    summary_data = pd.DataFrame(0, index=data_copy.index, columns=["positive_relation", "negative_relation"])
    linked_genes = set()
    # Dict used as an insertion-ordered set: constant time membership, stable order for the warnings
    edges_to_remove = {}

    # The gene a column maps to does not depend on the patient, so resolve it once per column and
    # keep only the columns whose gene is part of the knowledge graph.
    graph_columns = []
    for column, gene in enumerate(data_copy.columns):
        # Avoid mangled duplicates from pandas
        if isinstance(gene, str) and "." in gene:
            base_name = gene.split(".")[0]
            if base_name in data_copy.columns:
                gene = base_name

        # Skip if gene is not in the knowledge graph
        if gene in information_graph.nodes:
            graph_columns.append((column, gene))

    for index, patient in enumerate(tqdm(data_copy.index, desc='Adding patients to the network: ')):
        value_list = values_data[index]

        for column, gene in graph_columns:
            value = value_list[column]

            # Ignore features with score of 0
            if value == 0:
                continue

            relation = value_mapping[value]

            if overlay_graph.has_edge(patient, gene):
                if overlay_graph.get_edge_data(patient, gene)['relation'] != relation:
                    edges_to_remove[(patient, gene)] = None
                    continue

            linked_genes.add(gene)
            overlay_graph.add_edge(patient, gene, relation=relation, label=patient_label_mapping[patient])

            # NOTE(review): the count is not reverted when the edge is later discarded as conflicting,
            # and the gene stays in linked_genes - confirm whether that is intended.
            if summary:
                summary_data.at[patient, VALUE_TO_COLNAME[value]] += 1

    # Remove patient-gene triples that have conflicting duplicates in the data
    for patient, gene in edges_to_remove:
        logger.warning(f"{patient}-{gene} triple is being discarded due to conflicting data")
        overlay_graph.remove_edge(patient, gene)

    if not summary:
        return overlay_graph

    non_conn_pats = summary_data[
        (summary_data['positive_relation'] == 0) & (summary_data['negative_relation'] == 0)
    ]
    if len(non_conn_pats) > 0:
        logger.warning(f'{len(non_conn_pats)} samples is/are not connected to any genes.')

    return overlay_graph, summary_data, linked_genes