# Source code for tigerlily.pagerank

"""Personalized PageRank computation with TigerGraph."""

import json
import typing
from typing import Dict, List

import pandas as pd
import pyTigerGraph as tg  # noqa: N813
import requests
from tqdm.notebook import tqdm

class PersonalizedPageRankMachine:
    """Define a drug-protein graph and compute the Personalized PageRank of nodes."""

    def __init__(self, host: str, graphname: str, username: str, secret: str, password: str):
        """Set up a Personalized PageRank computation machine.

        Only stores the credentials; no connection is opened until
        :meth:`connect` is called.

        :param host: Address of the TigerGraph host.
        :param graphname: Name of the graph used for analytics.
        :param username: The username for the graph.
        :param secret: The secret generated in TigerGraph Studio.
        :param password: The password of the user.
        """
        self._host = host
        self._graphname = graphname
        self._username = username
        self._secret = secret
        self._password = password

    def connect(self):
        """Connect to the host with the authentication details.

        A token is requested through an unauthenticated connection first; the
        authenticated connection built with it is stored as ``self.connection``
        and used by every other method of this class.
        """
        token_getter = tg.TigerGraphConnection(host=self._host, graphname=self._graphname)
        # NOTE(review): "12000" is passed as the second positional argument and
        # looks like a token lifetime -- confirm against the ``getToken``
        # signature of the installed pyTigerGraph version.
        token = token_getter.getToken(self._secret, "12000")[0]
        self.connection = tg.TigerGraphConnection(
            host=self._host,
            graphname=self._graphname,
            username=self._username,
            password=self._password,
            apiToken=token,
        )

    def _purge_graph(self):
        """Delete the existing drug and gene type nodes."""
        for vertex_type in ("drug", "gene"):
            self.connection.delVertices(vertex_type)

    def _upload_relationship(self, edges: pd.DataFrame, source: str, target: str, edge_type: str = "interacts"):
        """Upload the edges of a dataframe that match a source and target node type.

        :param edges: Edges dataframe of interest.
        :param source: Source node type.
        :param target: Target node type.
        :param edge_type: The type of edges.
        """
        matches = (edges["type_1"] == source) & (edges["type_2"] == target)
        node_pairs = edges[matches][["node_1", "node_2"]].values.tolist()
        # Each edge is upserted with an empty attribute dictionary.
        payload = [(pair[0], pair[1], {}) for pair in node_pairs]
        self.connection.upsertEdges(source, edge_type, target, payload)

    def upload_graph(self, new_graph: bool, edges: pd.DataFrame):
        """Upload the edges from a dataframe using the PyTigerGraph connection.

        :param new_graph: Decision about deleting the existing nodes in the graph.
        :param edges: The dataframe with the edges between drugs and proteins.
            It must have the columns ``type_1``, ``type_2``, ``node_1`` and ``node_2``.
        :raises ValueError: If any of the required columns is missing.
        """
        # Validate explicitly instead of with ``assert`` so that the check
        # still runs when Python is started with ``-O``.
        required_columns = ("type_1", "type_2", "node_1", "node_2")
        missing = [column for column in required_columns if column not in edges.columns]
        if missing:
            raise ValueError(f"The edges dataframe is missing required column(s): {', '.join(missing)}")

        if new_graph:
            self._purge_graph()

        for source, target in (("drug", "gene"), ("gene", "gene"), ("gene", "drug")):
            self._upload_relationship(edges, source, target, "interacts")

    def install_query(
        self,
        url: str = "",  # noqa:E501
    ):
        """Install a query on the host.

        The script is downloaded, every ``CREATE QUERY`` is rewritten to
        ``CREATE OR REPLACE QUERY``, and all queries are then installed.

        :param url: A url to the query string.
        :raises ValueError: If ``url`` is empty.
        :raises requests.HTTPError: If the download returns an HTTP error status.
        """
        if not url:
            raise ValueError("A non-empty url pointing to the query script is required.")

        response = requests.get(url, timeout=60)
        # Fail here rather than sending an HTTP error page to GSQL as a script.
        response.raise_for_status()

        script = response.text.replace("CREATE QUERY", "CREATE OR REPLACE QUERY")
        self.connection.gsql(script)
        self.connection.gsql("INSTALL QUERY ALL")

    @typing.no_type_check
    def personalized_pagerank(
        self,
        node_id: str,
        node_type: str = "drug",
        edge_type: str = "interacts",
        print_accum: bool = True,
        damping: float = 0.85,
        iterations: int = 20,
        top_k: int = 40,
    ) -> Dict:
        """Compute the pagerank for a specific node.

        Runs the ``tg_pagerank_pers`` query through GSQL and parses its output
        as JSON.

        :param node_id: Identifier of the node of interest.
        :param node_type: Type of the node.
        :param edge_type: Type of the edge.
        :param print_accum: Accumulation flag.
        :param damping: Non return probability.
        :param iterations: Number of steps per walk.
        :param top_k: Number of closest neighbors to return for the query.
        :returns: Personalized PageRank nodes for a specific node in the Graph.
        """
        params = {
            "source": [{"type": node_type, "id": node_id}],
            "e_type": edge_type,
            "print_accum": print_accum,
            "damping": damping,
            "iter": iterations,
            "top_k": top_k,
        }
        query = "RUN QUERY tg_pagerank_pers(" + json.dumps(params) + ")"
        return json.loads(self.connection.gsql(query))

    def get_personalized_pagerank(
        self,
        node_ids: List,
        edge_type: str = "interacts",
        print_accum: bool = True,
        damping: float = 0.5,
        iterations: int = 100,
        top_k: int = 100,
    ) -> pd.DataFrame:
        """Compute the pruned Personalized PageRank for a list of nodes.

        :param node_ids: The nodes of interest; each entry is indexed with the
            keys ``"v_id"`` (identifier) and ``"v_type"`` (node type).
        :param edge_type: Type of the edge.
        :param print_accum: Accumulation flag.
        :param damping: Non return probability.
        :param iterations: Number of steps per walk.
        :param top_k: Number of closest neighbors to return for the query.
        :returns: A table of node pairs with PageRank scores. It is empty, but
            has the ``node_1``, ``node_2`` and ``score`` columns, when
            ``node_ids`` is empty.
        """
        columns = ["node_1", "node_2", "score"]

        node_ids = list(node_ids)
        if not node_ids:
            # ``pd.concat`` raises on an empty list, so return an empty table.
            return pd.DataFrame(columns=columns)

        all_scores = []
        for node in tqdm(node_ids):
            response = self.personalized_pagerank(
                node["v_id"], node["v_type"], edge_type, print_accum, damping, iterations, top_k
            )
            top_scores = response["results"][0]["top_scores"]
            rows = [[node["v_id"], entry["vertex_id"], entry["score"]] for entry in top_scores]
            all_scores.append(pd.DataFrame(rows, columns=columns))

        return pd.concat(all_scores)