{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "e0301277",
   "metadata": {},
   "source": [
    "# Patient Params - Edit"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "0dea2935",
   "metadata": {},
   "outputs": [],
   "source": [
    "# @CLININFO - Edit any patient specific data in this cell.\n",
    "#  Should work with any standard POG / PROFYLE\n",
    "#   slow run ~ 10 min / gene full plots\n",
    "ID = \"POG1218\"\n",
    "full_plot_genes = []\n",
    "full_plot_genes += [\"HTR1A\", \"MYC\"]\n",
    "\n",
    "checkpoint_genes = [\"CD274\", \"CTLA4\"]\n",
    "# full_plot_genes += checkpoint_genes\n",
    "\n",
    "# leave blank to be filled by api defaults (later)\n",
    "disease_comps = []  # disease_comps = api.get_disease_comparators(lib)\n",
    "tissue_comps = []  # tissue_comps = api.get_gtex_tissue_comparators(lib)\n",
    "kb_disease_match = [\n",
    "]  # kb_disease_match = api.get_oncotree_comparator_name(lib)\n",
    "lib = []\n",
    "\n",
    "MATCH_ALL_GENES = False  # look for other expression outliers besides the 'full_plot_genes'\n",
    "TCGA_PLOT = True\n",
    "GTEX_PLOT = True\n",
    "\n",
    "val_units = \"tpm\"\n",
    "assembly_id = \"hg38\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "cadce8bf",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Some standard python imports\n",
    "import glob\n",
    "import importlib\n",
    "from math import log2\n",
    "import numpy as np\n",
    "import os\n",
    "import pandas as pd\n",
    "from pprint import pprint\n",
    "import requests\n",
    "from scipy import stats\n",
    "\n",
    "from logzero import logger, logfile"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "b78efeb4",
   "metadata": {},
   "outputs": [],
   "source": [
    "# plotting imports\n",
    "# !pip install seaborn\n",
    "import seaborn as sns\n",
    "import matplotlib.pyplot as plt"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "db6b8f1f",
   "metadata": {},
   "outputs": [],
   "source": [
    "# GSC imports\n",
    "from gsc_genomic_report.genome_process.measures import expression_categories\n",
    "from gsc_genomic_report.genome_process.measures.expression_categories import (\n",
    "    URL,\n",
    "    default_disease_id,\n",
    "    default_gtex_id,\n",
    "    expression_categories_from_microservice,\n",
    "    get_vardbmicro_matrices,\n",
    "    get_disease_matrices,\n",
    "    get_gtex_matrices,\n",
    "    ipr_columns,\n",
    "    normalize_expression_columns,\n",
    ")\n",
    "from gsc_genomic_report.ipr_upload import (\n",
    "    GKB_PASSWORD_KEY,\n",
    "    PasswordManager,\n",
    ")\n",
    "\n",
    "from ipr.main import create_report as ipr_report\n",
    "\n",
    "importlib.reload(\n",
    "    expression_categories)  # Force reload of module while developing"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "bbe3b1f6",
   "metadata": {},
   "source": [
    "# Bioapps Patient Parameters\n",
    "Set the ID to find libraries and RSEM expression file.\n",
    "\n",
    "First we just find the expression data from the patient and load it into a table."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "a0e97c02",
   "metadata": {},
   "outputs": [],
   "source": [
    "api = expression_categories.BioappsApi(\n",
    ")  # bioapps api for libraries, comparators, etc.\n",
    "api.get_lib_names(ID)  # all the libraries of the id"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "e32d7361",
   "metadata": {},
   "outputs": [],
   "source": [
    "if not lib:\n",
    "    rna = api.get_tumour_transcriptome_libraries(ID)\n",
    "    rna_libs = list(rna.keys())\n",
    "    if len(rna_libs) > 1:\n",
    "        logger.warning(f\"Using {ID} RNA lib {rna_libs[0]} - ignoring alts {rna_libs[1:]}\")\n",
    "    lib = rna_libs[0]\n",
    "\n",
    "logger.info(f\"Getting RSEM path of {lib}\")\n",
    "rsem_dir = api.get_lib_rsem_filepath(lib)[0]\n",
    "rsem_fn = glob.glob(rsem_dir + \"/*genes.results\")[0]\n",
    "logger.info(rsem_fn)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "73e70e4f",
   "metadata": {},
   "source": [
    "## RSEM File"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "88decc7b",
   "metadata": {},
   "outputs": [],
   "source": [
    "logfilename = os.path.abspath(f\"{ID}.{lib}.log\")\n",
    "print(f\"Saving logs to: {logfilename}\")\n",
    "logfile(logfilename)\n",
    "logger.info(f\"Found {ID} {lib} RSEM: {rsem_fn}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "a5f084ec",
   "metadata": {},
   "outputs": [],
   "source": [
    "rsem = pd.read_csv(rsem_fn, sep=\"\\t\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "f2cd0232",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Remove strange chromosomes like 'chrUn_KI270442v1' 'chr14_GL000225v1_random', etc.\n",
    "std_chroms = [\n",
    "    chrom for chrom in rsem['chr_name'].unique()\n",
    "    if not chrom.endswith('random') and not chrom.startswith(\"chrUn\")\n",
    "]\n",
    "print(std_chroms)\n",
    "rsem = rsem[rsem['chr_name'].isin(std_chroms)]"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "470ab502",
   "metadata": {},
   "source": [
    "## RSEM table"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "ac4cbbff",
   "metadata": {},
   "outputs": [],
   "source": [
    "rsem"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "bfa4cc95",
   "metadata": {},
   "source": [
    "# Get comparators & Matrix names\n",
    "\n",
    "Find the relevant comparators from the bioapps api base on the RNA library name.\n",
    "\n",
    "Given the comparators, find the relevant vardb tables to decide expression outlier categories."
   ]
  },
  {
   "cell_type": "markdown",
   "id": "c7523509",
   "metadata": {},
   "source": [
    "## Comparators from bioapps"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "a5ab59e3",
   "metadata": {},
   "outputs": [],
   "source": [
    "# for kb_disease_match\n",
    "if not kb_disease_match:\n",
    "    kb_disease_match = api.get_oncotree_comparator_name(lib)\n",
    "    logger.info(f\"found {lib} kb_disease_match {kb_disease_match}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "3d1460c1",
   "metadata": {},
   "outputs": [],
   "source": [
    "if not disease_comps:\n",
    "    disease_comps = api.get_disease_comparators(lib)\n",
    "    logger.info(f\"found {lib} disease_comps {disease_comps}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "676820aa",
   "metadata": {},
   "outputs": [],
   "source": [
    "if not tissue_comps:\n",
    "    tissue_comps = api.get_gtex_tissue_comparators(lib)\n",
    "    logger.info(f\"found {lib} tissue_comps {tissue_comps}\")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "b94811ae",
   "metadata": {},
   "source": [
    "## vardbmicro matrix finding"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "d7b3da57",
   "metadata": {},
   "outputs": [],
   "source": [
    "# help(get_vardbmicro_matrices)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "bf446b5f",
   "metadata": {},
   "outputs": [],
   "source": [
    "# help(get_disease_matrices)\n",
    "# help(default_disease_id)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "63818196",
   "metadata": {},
   "outputs": [],
   "source": [
    "matrices = get_vardbmicro_matrices()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "e3bd6012",
   "metadata": {},
   "outputs": [],
   "source": [
    "# RSEM val_units=\"tpm\" and assembly_id=\"hg38\"\n",
    "disease_matrix_id = default_disease_id(disease_comps[0], val_units=val_units, assembly_id=assembly_id)\n",
    "logger.info(f\"Disease comparator matrix_id {disease_matrix_id} from {disease_comps[0]}\")\n",
    "for alt_comp in disease_comps[1:]:\n",
    "    alt_id = default_disease_id(alt_comp, val_units=val_units, assembly_id=assembly_id)\n",
    "    logger.info(f\"alt disease comparator {alt_id} from {alt_comp}\")\n",
    "\n",
    "get_disease_matrices(matrices, disease_matrix_id)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "f446e788",
   "metadata": {},
   "outputs": [],
   "source": [
    "tissue_matrix_id = default_gtex_id(tissue_comps[0], val_units, assembly_id=assembly_id)\n",
    "logger.info(f\"Tissue comparator matrix_id {tissue_matrix_id} from {tissue_comps[0]}\")\n",
    "for alt_comp in tissue_comps[1:]:\n",
    "    alt_id = default_gtex_id(alt_comp, val_units, assembly_id=assembly_id)\n",
    "    logger.info(f\"alt Tissue comparator {alt_id} from {alt_comp}\")\n",
    "get_gtex_matrices(matrices, tissue_matrix_id)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "b6b43151",
   "metadata": {},
   "source": [
    "# Expression Categories"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "605b3b76",
   "metadata": {},
   "outputs": [],
   "source": [
    "# uncomment to print help __doc__\n",
    "# help(expression_categories_from_microservice)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "a67e35bb",
   "metadata": {},
   "source": [
    "Load all the RSEM data and apply expression outlier categories.\n",
    "This will take 5-10 mins."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "81525358",
   "metadata": {},
   "outputs": [],
   "source": [
    "# @CLININFO - Experimental expression outlier calculation - microvardb & RSEM TPM data\n",
    "\n",
    "logger.info(\"Finding 'increased expression' and 'reduced expression' \"\n",
    "            f\"from tissue {tissue_comps} and disease {disease_matrix_id}\")\n",
    "\n",
    "match_exp_data = None\n",
    "if MATCH_ALL_GENES:\n",
    "    match_exp_data = rsem_fn\n",
    "elif full_plot_genes:\n",
    "    match_exp_data = normalize_expression_columns(rsem, genes=full_plot_genes)\n",
    "\n",
    "if match_exp_data is None:\n",
    "    logger.info(\"skipping expression outlier matching - no data\")\n",
    "else:\n",
    "    logger.info(f\"Expression data from: {rsem_fn}\")\n",
    "    if MATCH_ALL_GENES:\n",
    "        logger.info(\"...~ 10K genes per minute\")\n",
    "    else:\n",
    "        logger.info(f\"\\only matching genes: {full_plot_genes}\")\n",
    "    exp = expression_categories_from_microservice(\n",
    "        match_exp_data,\n",
    "        disease_expression_comp_list=[disease_matrix_id],\n",
    "        gtex_comps=tissue_comps,\n",
    "    )\n",
    "    logger.info('finished expression categories')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "82d4935a",
   "metadata": {},
   "outputs": [],
   "source": [
    "exp.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "3402f8cb",
   "metadata": {},
   "outputs": [],
   "source": [
    "# @CLININFO - Show some basic statistics about the expression categories\n",
    "cats = sorted(exp.kbCategory.unique())\n",
    "logger.info(f\"Of {len(exp)} genes:\")\n",
    "for category in cats:\n",
    "    logger.info(\n",
    "        f\"\\t{len(exp[exp['kbCategory'] == category])} genes of kbCategory: '{category}'\"\n",
    "    )"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "09ab7a01",
   "metadata": {},
   "outputs": [],
   "source": [
    "# @CLININFO - The 'expressionState' is just a descriptive label at the moment\n",
    "logger.info(f\"Of {len(exp)} genes:\")\n",
    "for category in sorted(exp.expressionState.unique()):\n",
    "    logger.info(\n",
    "        f\"\\t{len(exp[exp['expressionState'] == category])}\\t genes of expressionState: '{category}'\"\n",
    "    )"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "abf6457b",
   "metadata": {},
   "outputs": [],
   "source": [
    "ipr_columns(exp)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "8507bd41",
   "metadata": {},
   "source": [
    "# GraphKb Match through IPR\n",
    "\n",
    "Requires the expression with disease categories\n",
    "\n",
    "Currently GraphKB understands to look for genes where 'increased expression' or 'reduced expression' category is relevant to the cancer."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "ef731199",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Add some minimal report details for matching using GraphKB\n",
    "report_details = dict()\n",
    "report_details['patient_id'] = ID\n",
    "report_details[\"interactive\"] = True\n",
    "\n",
    "report_details[\"kb_disease_match\"] = kb_disease_match[0]\n",
    "\n",
    "report_details[\"optional_content\"] = {\"comparators\": []}\n",
    "if not disease_comps:\n",
    "    disease_comps = api.get_disease_comparators(lib)\n",
    "if not tissue_comps:\n",
    "    tissue_comps = api.get_gtex_tissue_comparators(lib)\n",
    "\n",
    "logger.info(f\"disease_comps {disease_comps}\")\n",
    "logger.info(f\"tissue_comps {tissue_comps}\")\n",
    "\n",
    "comp = {\"analysisRole\": \"expression (disease)\", \"name\": disease_comps[0]}\n",
    "report_details[\"optional_content\"][\"comparators\"].append(comp)\n",
    "comp = {\"analysisRole\": \"expression (primary site)\", \"name\": tissue_comps[0]}\n",
    "report_details[\"optional_content\"][\"comparators\"].append(comp)\n",
    "comp = {\"analysisRole\": \"expression (biopsy site)\", \"name\": tissue_comps[-1]}\n",
    "report_details[\"optional_content\"][\"comparators\"].append(comp)\n",
    "\n",
    "report_details[\"optional_content\"][\"template\"] = \"genomic\"\n",
    "\n",
    "pprint(report_details)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "87a562bb",
   "metadata": {},
   "outputs": [],
   "source": [
    "logger.info(\"Starting Graphkb matching / IPR upload\")\n",
    "logger.info(\"Finding graphkb/IPR username and password\")\n",
    "login = PasswordManager()\n",
    "report_details[\"username\"] = login.get_user()\n",
    "report_details[\"password\"] = login.get_password(GKB_PASSWORD_KEY)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "b10eb32d",
   "metadata": {},
   "outputs": [],
   "source": [
    "# help(ipr_report)\n",
    "# exp"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "b58455c2",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Format expression catgories table into a list of dicts with appropriate columns for matching\n",
    "ipr_exp = []\n",
    "for i, row in ipr_columns(exp).iterrows():\n",
    "    ipr_exp.append(row.fillna(\"\").to_dict())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "8d3c3dd6",
   "metadata": {},
   "outputs": [],
   "source": [
    "# example output - 'kbCategory' could be set to 'increased expression' or 'reduced expression' for matching\n",
    "ipr_exp[0]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "9a736910",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Match to IPR; this will take a couple of minutes\n",
    "report_details[\"expression_variant_rows\"] = ipr_exp\n",
    "ipr_json = ipr_report(**report_details)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "3a2b3000",
   "metadata": {},
   "outputs": [],
   "source": [
    "ipr_json.keys()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "8342bbc3",
   "metadata": {},
   "outputs": [],
   "source": [
    "pprint(ipr_json['genomicAlterationsIdentified'])\n",
    "expression_variants = sorted(\n",
    "    [g['geneVariant'] for g in ipr_json['genomicAlterationsIdentified']])\n",
    "altered_genes = sorted([\n",
    "    g['geneVariant'].split()[0]\n",
    "    for g in ipr_json['genomicAlterationsIdentified']\n",
    "])\n",
    "\n",
    "print(altered_genes)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "89e880f8",
   "metadata": {},
   "source": [
    "## Clinically Relevant Expression Outliers"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "79fb4abe",
   "metadata": {},
   "outputs": [],
   "source": [
    "# @CLININFO - matched expression outliers\n",
    "logger.info(\n",
    "    f\"Significant {kb_disease_match[0]} expression outliers compared to tissue {tissue_comps[0]} and disease {disease_comps[0]}\"\n",
    ")\n",
    "logger.info(\n",
    "    f\"{len(altered_genes)} from {len(ipr_json['expressionVariants'])} expression variants.\"\n",
    ")\n",
    "for exp_var in expression_variants:\n",
    "    logger.info(exp_var)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "e9a350bc",
   "metadata": {},
   "source": [
    "## Show knowledgebase match details for the gene expression outliers"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "26813352",
   "metadata": {},
   "outputs": [],
   "source": [
    "logger.info(\n",
    "    f\"Found {len(ipr_json['kbMatches'])} GraphKb matches to {altered_genes}.  Some examples\")\n",
    "ipr_json['kbMatches'][:3]"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "5f45c945",
   "metadata": {},
   "source": [
    "# vardb matrix values\n",
    "\n",
    "Find the list of expression values from vardb.\n",
    "Require vpn connection."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "c963d059",
   "metadata": {},
   "outputs": [],
   "source": [
    "cols = [\n",
    "    'gene_name', 'kbCategory', 'FPKM', 'tpm', 'diseasekIQR',\n",
    "    'diseasePercentile', 'primarySitekIQR', 'primarySitePercentile',\n",
    "    'expressionState', 'gene_id', 'gene_version', 'transcript_id(s)'\n",
    "]\n",
    "xp = exp[exp.gene_name.isin(altered_genes + full_plot_genes)][cols]\n",
    "xp"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "8d0ec1aa",
   "metadata": {},
   "source": [
    "## Exp Outliers - Plot disease comparator and Gtex comparator"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "d23291e7",
   "metadata": {},
   "outputs": [],
   "source": [
    "logger.info(f\"tissue_matrix {tissue_matrix_id}\")\n",
    "tissue_url = f\"{URL}/matrices/{tissue_matrix_id}/genes\"\n",
    "logger.info(tissue_url)\n",
    "\n",
    "resp = requests.get(tissue_url).json()['result']\n",
    "print(resp['usage_notices'])\n",
    "tissue_genes = resp['records']\n",
    "tissue_genes[:3]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "f05b4594",
   "metadata": {},
   "outputs": [],
   "source": [
    "logger.info(f\"disease_matrix_id {disease_matrix_id}\")\n",
    "disease_url = f\"{URL}/matrices/{disease_matrix_id}/genes\"\n",
    "logger.info(disease_url)\n",
    "\n",
    "resp = requests.get(disease_url).json()['result']\n",
    "if 'usage_notices' in resp and resp['usage_notices']:\n",
    "    logger.warning(resp['usage_notices'])\n",
    "disease_genes = resp['records']\n",
    "disease_genes[:3]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "829caf60",
   "metadata": {},
   "outputs": [],
   "source": [
    "def match_gene(row, gene_list):\n",
    "    \"\"\"Deal with different gene names like 'ENSG00000263911.1_hg19' vs 'C1orf68|100129271_hg19'\"\"\"\n",
    "    name_list = [\n",
    "        g['gene_id'] for g in gene_list\n",
    "        if g['gene_id'].startswith(row['gene_name'])\n",
    "        or g['gene_id'].startswith(row['gene_id'])\n",
    "    ]\n",
    "    if not name_list:\n",
    "        logger.error(f\"No match found for {row['gene_name']} {row['gene_id']}\")\n",
    "        return \"\"\n",
    "    elif len(name_list) > 1:\n",
    "        logger.warning(\n",
    "            f\"Multiple genes for {row['gene_name']} {row['gene_id']} - using {name_list[0]} from {name_list[1:]}\"\n",
    "        )\n",
    "    return name_list[0]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "cbd31da4",
   "metadata": {},
   "outputs": [],
   "source": [
    "xp[disease_matrix_id] = xp.apply(match_gene, gene_list=disease_genes, axis=1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "ba8eede4",
   "metadata": {},
   "outputs": [],
   "source": [
    "xp[tissue_matrix_id] = xp.apply(match_gene, gene_list=tissue_genes, axis=1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "5effd3de",
   "metadata": {},
   "outputs": [],
   "source": [
    "xp.sort_values(\"gene_name\", inplace=True)\n",
    "xp.set_index(\"gene_name\", inplace=True, drop=False)\n",
    "xp"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "4c41607a",
   "metadata": {},
   "outputs": [],
   "source": [
    "from math import log10\n",
    "FUDGE = 0.001\n",
    "\n",
    "\n",
    "def get_exp_vals(gene, matrix, fudge=FUDGE):\n",
    "    \"\"\"Retrieve the values only, rounding by fudge.\n",
    "\n",
    "    Adding a non-zero fudge value stops log tranformations from -inf values.\n",
    "    Effectively makes the low detection limit cutoff.\n",
    "        eg. Adding 0.001 effectively makes anything below 1 transcript per billion as noise/within error of zero.\n",
    "            All genes express at least 1 transcript per thousand.\n",
    "    \"\"\"\n",
    "    if not matrix:\n",
    "        return []\n",
    "    url = f\"{URL}/matrices/{matrix}/genes/{gene}/expression-values\"\n",
    "    # return url\n",
    "    resp = requests.get(url).json()\n",
    "    if 'result' in resp:\n",
    "        expr = requests.get(url).json()['result']['records']\n",
    "        digits = abs(round(log10(fudge))) + 1\n",
    "        return [round(e['expression_value'] + fudge, digits) for e in expr]\n",
    "    logger.error(f\"No result for {gene} {matrix}: {url}\")\n",
    "    return []"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "2b4dfd10",
   "metadata": {},
   "source": [
    "## Exp Outliers - old style Plots - disease and primary comparator"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "0089afba",
   "metadata": {},
   "outputs": [],
   "source": [
    "# @CLININFO - Detailed histograms of the GTEX and TCGA used to classify the expression as outlier\n",
    "MAX_PLOTS = 30\n",
    "figs = {}\n",
    "log2figs = {}\n",
    "gene_list = full_plot_genes + altered_genes\n",
    "for gene in gene_list[:MAX_PLOTS]:\n",
    "    disease_tpms = get_exp_vals(matrix=disease_matrix_id,\n",
    "                                gene=xp.loc[gene][disease_matrix_id])\n",
    "    logger.info(f\"{gene} {len(disease_tpms)} points from {disease_matrix_id}\")\n",
    "    logger.info(\n",
    "        f\"\\tmean {np.mean(disease_tpms):.3f}, median {np.median(disease_tpms):.3f}, std: {np.std(disease_tpms):.3f}\"\n",
    "    )\n",
    "    logger.info(\n",
    "        f\"\\tskew {stats.skew(disease_tpms):.3f}, kurtosis {stats.kurtosis(disease_tpms):.3f}\"\n",
    "    )\n",
    "\n",
    "    tissue_tpms = get_exp_vals(matrix=tissue_matrix_id,\n",
    "                               gene=xp.loc[gene][tissue_matrix_id])\n",
    "    logger.info(f\"{gene} {len(tissue_tpms)} points from {tissue_matrix_id}\")\n",
    "    logger.info(\n",
    "        f\"\\tmean {np.mean(tissue_tpms):.3f}, median {np.median(tissue_tpms):.3f}, std: {np.std(tissue_tpms):.3f}\"\n",
    "    )\n",
    "    logger.info(\n",
    "        f\"\\tskew {stats.skew(tissue_tpms):.3f}, kurtosis {stats.kurtosis(tissue_tpms):.3f}\"\n",
    "    )\n",
    "\n",
    "    # small table for ploting data\n",
    "    dt = pd.DataFrame(columns=['tpm', 'matrix'])\n",
    "    dt['tpm'] = tissue_tpms\n",
    "    dt['matrix'] = tissue_matrix_id\n",
    "    dd = pd.DataFrame(columns=['tpm', 'matrix'])\n",
    "    dd['tpm'] = disease_tpms\n",
    "    dd['matrix'] = disease_matrix_id\n",
    "    dt = pd.concat([dt, dd])\n",
    "\n",
    "    # hist plot\n",
    "    fig, ax = plt.subplots()\n",
    "    sns.histplot(dt,\n",
    "                 x=\"tpm\",\n",
    "                 hue=\"matrix\",\n",
    "                 kde=True,\n",
    "                 stat='probability',\n",
    "                 common_norm=False,\n",
    "                 common_bins=True,\n",
    "                 bins=20,\n",
    "                 ax=ax)\n",
    "    title = f\"{gene} {round(xp.loc[gene, 'tpm'], 1)} TPM\"\n",
    "    ax.set_title(title)\n",
    "    ax.axvline(xp.loc[gene, 'tpm'], color=\"r\", linestyle=\"--\")\n",
    "    figs[gene] = fig\n",
    "\n",
    "    # hist log2\n",
    "    fig, ax = plt.subplots()\n",
    "    sns.histplot(dt,\n",
    "                 x=\"tpm\",\n",
    "                 hue=\"matrix\",\n",
    "                 kde=True,\n",
    "                 stat='probability',\n",
    "                 common_norm=False,\n",
    "                 common_bins=True,\n",
    "                 log_scale=2,\n",
    "                 ax=ax)\n",
    "    title = f\"{gene} {round(xp.loc[gene, 'tpm'], 1)} TPM ~ (2**{round(log2(xp.loc[gene, 'tpm']), 3)})\"\n",
    "    ax.set_title(title)\n",
    "    ax.axvline(xp.loc[gene, 'tpm'], color=\"r\", linestyle=\"--\")\n",
    "    log2figs[gene] = fig"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "9db4d5d3",
   "metadata": {},
   "source": [
    "# Full set comparisons - GERO-182"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "2b07b55d",
   "metadata": {},
   "source": [
    "## Plot function helper"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "32b65e5e",
   "metadata": {},
   "outputs": [],
   "source": [
    "PLOT_STYLES = [\n",
    "    'violinboxen',\n",
    "    'violinplot',\n",
    "    'boxenplot',\n",
    "    'boxplot',\n",
    "]\n",
    "\n",
    "\n",
    "def plotter(style_code, data, xcol, ycol, title=None, vline=None):\n",
    "    \"\"\"Some standard styles for plotting.\n",
    "\n",
    "    Args:\n",
    "        style_code (str):\n",
    "            'violinboxen' - composit with boxen plot style\n",
    "            'violinplot' - standard violin plot\n",
    "            'boxenplot' - seaborn extended box plot\n",
    "            'boxplot' - standard seaborn boxplot\n",
    "    \"\"\"\n",
    "    if style_code == 'violinboxen':\n",
    "        ax = sns.violinplot(\n",
    "            data=data,\n",
    "            x=xcol,\n",
    "            y=ycol,\n",
    "            scale=\"width\",\n",
    "            inner=None,\n",
    "        )\n",
    "        ax = sns.boxenplot(\n",
    "            data=data,\n",
    "            x=x_val,\n",
    "            y=ycol,\n",
    "            width=0.3,\n",
    "            ax=ax,\n",
    "        )\n",
    "    elif style_code == 'violinplot':\n",
    "        ax = sns.violinplot(\n",
    "            data=data,\n",
    "            x=xcol,\n",
    "            y=ycol,\n",
    "            scale=\"width\",\n",
    "        )\n",
    "    elif style_code == 'boxenplot':\n",
    "        ax = sns.boxenplot(\n",
    "            data=data,\n",
    "            x=x_val,\n",
    "            y=ycol,\n",
    "        )\n",
    "    elif style_code == 'boxplot':\n",
    "        ax = sns.boxplot(\n",
    "            data=data,\n",
    "            x=x_val,\n",
    "            y=ycol,\n",
    "        )\n",
    "    elif style_code == 'test':\n",
    "        ax = sns.boxplot(\n",
    "            data=data,\n",
    "            x=x_val,\n",
    "            y=ycol,\n",
    "            width=0.1,\n",
    "        )\n",
    "        ax = sns.violinplot(\n",
    "            data=data,\n",
    "            x=xcol,\n",
    "            y=ycol,\n",
    "            scale=\"width\",\n",
    "            bw=0.3,\n",
    "            # inner=None,\n",
    "            # cut=0,\n",
    "            ax=ax,\n",
    "        )\n",
    "    else:\n",
    "        raise ValueError(f\"No style_code {style_code}\")\n",
    "    if vline is not None:\n",
    "        ax.axvline(\n",
    "            vline,\n",
    "            color=\"r\",\n",
    "        )\n",
    "    if title:\n",
    "        ax.set_title(title)\n",
    "    return ax"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "1992e693",
   "metadata": {},
   "source": [
    "## Data Preparation"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "033d2947",
   "metadata": {},
   "outputs": [],
   "source": [
    "full_plot_genes += altered_genes\n",
    "full_plot_genes = sorted(set(full_plot_genes))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "58db808c",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Just keeping tpm matrices for comparison to RSEM\n",
    "\n",
    "xp = exp[exp['gene_name'].isin(full_plot_genes)].copy()\n",
    "xp.sort_values(by='gene_name', inplace=True)\n",
    "xp.set_index('gene_name', inplace=True, drop=False)\n",
    "xp['log2(tpm)'] = xp['tpm'].apply(lambda x: log2(x + FUDGE))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "79df4097",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Map all gene names of full RSEM exp table\n",
    "logger.info(f\"Mapping full RSEM gene names for {disease_matrix_id}\")\n",
    "xp[disease_matrix_id] = xp.apply(match_gene, gene_list=disease_genes, axis=1)\n",
    "logger.info(f\"Mapping full RSEM gene names for {tissue_matrix_id}\")\n",
    "xp[tissue_matrix_id] = xp.apply(match_gene, gene_list=tissue_genes, axis=1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "fc7e05b1",
   "metadata": {},
   "outputs": [],
   "source": [
    "xp"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "0b379ee4",
   "metadata": {},
   "outputs": [],
   "source": [
    "matrices = [\n",
    "    m for m in get_vardbmicro_matrices()\n",
    "    if not m['deleted'] and m['expression_value_type'] == 'tpm'\n",
    "]\n",
    "len(matrices)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "da047f95",
   "metadata": {},
   "source": [
    "### GTEX Matrices for plotting"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "d7726045",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Ideally we can group these by tissue / oncotree or something\n",
    "print(tissue_comps)\n",
    "GTEX_VER = 'v8'\n",
    "gtex_m_ids = sorted([\n",
    "    m['id'] for m in matrices\n",
    "    if m['id'].startswith('gtex_') and f\"_{GTEX_VER}_\" in m['id']\n",
    "])\n",
    "alt_gtex_m_ids = sorted([\n",
    "    m['id'] for m in matrices\n",
    "    if m['id'].startswith('gtex_') and f\"_{GTEX_VER}_\" not in m['id']\n",
    "])\n",
    "\n",
    "print(\"Other version GTEX\")\n",
    "pprint(alt_gtex_m_ids)\n",
    "\n",
    "print(\"\\n GTEX used:\")\n",
    "gtex_m_ids"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "588e6ed3",
   "metadata": {},
   "source": [
    "### TCGA Matrices for plotting"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "99470744",
   "metadata": {},
   "outputs": [],
   "source": [
    "tcga_m_ids = sorted([\n",
    "    m['id'] for m in matrices\n",
    "    if m['id'].startswith('tcga_') and \"_v10_\" in m['id']\n",
    "])\n",
    "tcga_m_ids"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "6569b586",
   "metadata": {},
   "source": [
    "## TCGA Plot"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "856b7824",
   "metadata": {},
   "source": [
    "### TCGA Plot - data prep"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "581bf097",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Map v10 genes\n",
    "if 'tcga_gene' not in xp.columns:\n",
    "    url = f\"{URL}/matrices/{tcga_m_ids[0]}/genes\"\n",
    "    tcga_gene_list = [g for g in requests.get(url).json()['result']['records']]\n",
    "    xp['tcga_gene'] = xp.apply(match_gene, gene_list=tcga_gene_list, axis=1)\n",
    "xp"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "6be19a7f",
   "metadata": {},
   "outputs": [],
   "source": [
    "# @CLININFO - slow step - not optimized - several minutes per gene\n",
    "try:\n",
    "    print(len(tcga_data))\n",
    "except Exception as err:\n",
    "    tcga_data = {}\n",
    "\n",
    "if TCGA_PLOT:\n",
    "    logger.info(\n",
    "        f\"Loading data for {len(full_plot_genes)} genes from {len(tcga_m_ids)} TCGA tables\"\n",
    "    )\n",
    "\n",
    "    for gene in full_plot_genes:\n",
    "        if gene in tcga_data:\n",
    "            continue\n",
    "        comp_data = {}\n",
    "\n",
    "        for i, mid in enumerate(tcga_m_ids):\n",
    "            # logger.info(f\"loading{gene} {mid}\")\n",
    "            if mid in xp.columns:\n",
    "                gid = xp.loc[gene][mid]\n",
    "            elif 'tcga_gene' in xp.columns:\n",
    "                gid = xp.loc[gene]['tcga_gene']\n",
    "            comp_data[mid] = get_exp_vals(matrix=mid, gene=gid)\n",
    "            logger.info(f\"Loaded {len(comp_data[mid])} {gene} values from {mid} as {gid} ({i + 1}/{len(tcga_m_ids)})\")\n",
    "            # - statistics slow things significantly - a few minutes per gene to load all tables\n",
    "            desc = stats.describe(comp_data[mid])\n",
    "            logger.info(\n",
    "                f\"\\t{gene} {mid}: {desc.nobs} vals mean: {desc.mean:.3f} range {desc.minmax}\"\n",
    "                f\" var: {desc.variance:.3f} skew: {desc.skewness:.3f}\")\n",
    "        tcga_data[gene] = comp_data"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "8bbdfc6a",
   "metadata": {},
   "source": [
    "### TCGA Plot"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "751f7ed0",
   "metadata": {
    "scrolled": false
   },
   "outputs": [],
   "source": [
    "# @CLININFO - Full TCGA plot dumps\n",
    "# Plot TCGA comparison set\n",
    "\n",
    "for gene in tcga_data.keys():\n",
    "    # organize the tcga data into a small table for plotting\n",
    "    comp_list = []\n",
    "    for mid in tcga_data[gene].keys():\n",
    "        dt = pd.DataFrame()\n",
    "        dt['tpm'] = tcga_data[gene][mid]\n",
    "        dt['comp'] = mid.replace(\"tcga_\", \"\").replace(\"_v10_tpm\", \"\")\n",
    "        comp_list.append(dt)\n",
    "    dt = pd.concat(comp_list)\n",
    "    dt['log2(tpm)'] = dt['tpm'].apply(log2)\n",
    "    title = f\"{gene} {round(xp.loc[gene, 'tpm'], 1)} TPM ~ (2**{round(xp.loc[gene, 'log2(tpm)'], 3)})\"\n",
    "    x_val = 'log2(tpm)'\n",
    "\n",
    "    for style in [\"test\"]:  # PLOT_STYLES:\n",
    "        fig_name = os.path.abspath(f\"{ID}.{lib}.{gene}.tcga_v10.{style}.png\")\n",
    "        logger.info(f\"Creating {os.path.basename(fig_name)}\")\n",
    "        plt.figure(figsize=(10, 24))  # Make the figure bigger\n",
    "\n",
    "        plotter(style_code=style,\n",
    "                data=dt,\n",
    "                xcol=x_val,\n",
    "                ycol='comp',\n",
    "                title=title,\n",
    "                vline=xp.loc[gene, x_val])\n",
    "\n",
    "        plt.savefig(fig_name)\n",
    "        logger.info(f\"Saved: {fig_name}\")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "c7cd4884",
   "metadata": {},
   "source": [
    "## GTEX Tissue plot"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "985613ea",
   "metadata": {},
   "source": [
    "### GTEX data prep"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "29ffdf4b",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Map GTEX v8 genes\n",
    "if GTEX_VER == 'v8':\n",
    "    url = f\"{URL}/matrices/{gtex_m_ids[0]}/genes\"\n",
    "    v8_gene_list = [g for g in requests.get(url).json()['result']['records']]\n",
    "    xp['gtex_gene'] = xp.apply(match_gene, gene_list=v8_gene_list, axis=1)\n",
    "xp"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "8f8fd191",
   "metadata": {},
   "outputs": [],
   "source": [
    "# @CLININFO - slow step - not optimized - several minutes per gene\n",
    "try:\n",
    "    print(len(gtex_data))\n",
    "except Exception as err:\n",
    "    gtex_data = {}\n",
    "\n",
    "if GTEX_PLOT:\n",
    "    # Load gene expression values from GTEX set\n",
    "    logger.info(\n",
    "        f\"Loading GTEX data for {len(full_plot_genes)} genes from {len(gtex_m_ids)} matrices.\"\n",
    "    )\n",
    "    for gene in full_plot_genes:\n",
    "        if gene in gtex_data:\n",
    "            continue\n",
    "        comp_data = {}\n",
    "        for i, mid in enumerate(gtex_m_ids):\n",
    "            if mid in xp.columns:\n",
    "                gid = xp.loc[gene][mid]\n",
    "            elif 'gtex_gene' in xp.columns:\n",
    "                gid = xp.loc[gene]['gtex_gene']\n",
    "            comp_data[mid] = get_exp_vals(matrix=mid, gene=gid)\n",
    "            logger.info(f\"Loaded {len(comp_data[mid])} {gene} values from {mid} as {gid} ({i + 1}/{len(gtex_m_ids)})\")\n",
    "\n",
    "            desc = stats.describe(comp_data[mid])\n",
    "            logger.info(\n",
    "                f\"\\t{gene} {mid}: {desc.nobs} vals mean: {desc.mean:.3f} range {desc.minmax}\"\n",
    "                f\" var: {desc.variance:.3f} skew: {desc.skewness:.3f}\")\n",
    "        gtex_data[gene] = comp_data\n",
    "    logger.info(f\"Finished loading GTEX\")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "08653731",
   "metadata": {},
   "source": [
    "### GTEX plots"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "ec21afd4",
   "metadata": {
    "scrolled": false
   },
   "outputs": [],
   "source": [
    "# @CLININFO - Full GTEX plot dumps\n",
    "# GTEX comparison plot\n",
    "for gene in gtex_data.keys():\n",
    "    title = f\"{gene} {round(xp.loc[gene, 'tpm'], 1)} TPM ~ (2**{round(xp.loc[gene, 'log2(tpm)'], 3)})\"\n",
    "    # compile GTEX gene expression values into a small table for plotting\n",
    "    comp_list = []\n",
    "    for mid in gtex_data[gene].keys():\n",
    "        dt = pd.DataFrame()\n",
    "        dt['tpm'] = gtex_data[gene][mid]\n",
    "        dt['comp'] = mid.replace(\"gtex_\", \"\").replace(f\"_n_{GTEX_VER}_tpm\",\n",
    "                                                       \"\")\n",
    "        comp_list.append(dt)\n",
    "    dt = pd.concat(comp_list)\n",
    "    dt['log2(tpm)'] = dt['tpm'].apply(log2)\n",
    "    x_val = 'log2(tpm)'\n",
    "\n",
    "    for style in [\"test\"]:  # PLOT_STYLES:\n",
    "        fig_name = os.path.abspath(\n",
    "            f\"{ID}.{lib}.{gene}.gtex_{GTEX_VER}.{style}.png\")\n",
    "        logger.info(f\"Creating {os.path.basename(fig_name)}\")\n",
    "        plt.figure(figsize=(10, 30))  # Make the figure bigger\n",
    "\n",
    "        plotter(style_code=style,\n",
    "                data=dt,\n",
    "                xcol=x_val,\n",
    "                ycol='comp',\n",
    "                title=title,\n",
    "                vline=xp.loc[gene, x_val])\n",
    "\n",
    "        plt.savefig(fig_name)\n",
    "        logger.info(f\"Saved: {fig_name}\")"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.3"
  },
  "toc": {
   "base_numbering": 1,
   "nav_menu": {},
   "number_sections": true,
   "sideBar": true,
   "skip_h1_title": false,
   "title_cell": "Table of Contents",
   "title_sidebar": "Contents",
   "toc_cell": false,
   "toc_position": {
    "height": "calc(100% - 180px)",
    "left": "10px",
    "top": "150px",
    "width": "340px"
   },
   "toc_section_display": true,
   "toc_window_display": true
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}