I load the packages I use the most while trying to avoid namespace conflicts. Any others are used with the :: syntax.
# Load packages in order of increasing importance
library(magrittr)
library(data.table)
library(matrixStats)
library(biomaRt)
library(NMF)
library(DESeq2)
library(maftools)
library(forcats)
library(tidyverse)Below are the paths to the files that will be used in this report.
paths <- list()
# Metadata
paths$patient_md <- project("data/metadata/patient_metadata.tsv")
paths$patient_icgc_md <- project("data/metadata/patient_metadata.icgc.tsv")
paths$genome_md <- project("data/metadata/genome_metadata.bams.tsv")
paths$genome_icgc_md <- project("data/metadata/genome_metadata.icgc.tsv")
paths$mrna_md <- project("data/metadata/mrna_metadata.bams.tsv")
paths$mirna_md <- project("data/metadata/mirna_metadata.bams.tsv")
paths$tn <- project("data/metadata/tumour_nuclei_percent.tsv")
# Reference
paths$bl_genes <- project("etc/bl.genes.txt")
paths$mbl_genes <- project("etc/mbl_signature.genes.txt")
paths$morgan_genes <- project("etc/morgan.genes.txt")
paths$wright_genes <- project("etc/wright.genes.txt")
paths$malaria_genes <- project("etc/malaria.genes.txt")
paths$latency_genes <- project("etc/latency.genes.txt")
paths$seq <- project("ref/GRCh38/Sequence")
paths$seq_grch37 <- project("ref/GRCh37-lite/Sequence")
paths$annot <- project("ref/GRCh38/Annotation")
paths$genome <- file.path(paths$seq, "WholeGenomeFasta/genome.simple_headers.fa")
paths$trans <- file.path(paths$seq, "GencodeTranscriptome/25")
paths$tx2gene <- file.path(paths$trans, "gencode_v25_and_NC_007605.tx2gene.tsv")
paths$gaps <- file.path(paths$annot, "ExcludedRegions/gaps.bed")
paths$genome_grch37 <- file.path(paths$seq_grch37, "WholeGenomeFasta/genome.simple_headers.fa")
# Results
paths$ebv_dna <- project("results/status/ebv_status.dna.txt")
paths$ebv_rna <- project("results/status/ebv_status.rna.txt")
paths$sex <- project("results/status/sex_status.txt")
paths$strelka <- project("results/strelka/3-tabulate")
paths$tc <- file.path(paths$strelka, "tumour_content.tsv")
paths$tc_icgc <- file.path(paths$strelka, "icgc.tumour_content.tsv")
paths$maf <- file.path(paths$strelka, "strelka.noncanonical.qss_filt.vaf_filt.aug.maf")
paths$maf_all <- file.path(paths$strelka, "all.strelka.canonical.qss_filt.aug.maf")
paths$mmaf <- project("results/smgs/data/BL_META.grch37.maf")
paths$smgs <- project("results/smgs/results/meta_analysis/BL_META.genes.txt")
paths$sequenza <- project("results/sequenza/2-sequenza/force_tc")
paths$gistic <- project("results/gistic")
paths$manta <- project("results/manta/1-manta/*/results/variants/somaticSV.vcf.gz")
paths$salmon <- project("results/salmon")
paths$mirna <- project("results/gsc_mirna/mirna_expression.tsv")We define variables that are generally useful below.
# Set RNG seed for reproducibility
rng_seed <- 87510475 # Generated by runif(1, 0, 10^8)
set.seed(rng_seed)
# Use all available cores when multithreading
num_cores <- parallel::detectCores()
doMC::registerDoMC(cores = num_cores)
BiocParallel::register(BiocParallel::MulticoreParam(workers = num_cores))
# Effective genome size
genome_size <- 2934876451
# Number of most variably expressed genes
ntop <- 1000
# Other options
options(scipen=10)