library(plyr)
library(dplyr)
library(data.table)
library(ggplot2)
library(pheatmap)
library(methods)
library(grid)
library(reshape2)
library(nlme)
library(stringr)
library(psych)
library(tibble)
library(cluster)

library(metautils)
library(ithi.meta)
library(clonecluster)
library(paperutils)

Colour palettes

pal_patient <- select_palette("patient")

Parameters

db_path <- snakemake@params$db
cluster_label_file <- snakemake@input$cluster_labels
## For sample labels and plotting
data_matrix_file <- snakemake@input$data_matrix

Metadata

db <- src_sqlite(db_path, create = FALSE)
samples <- collect(tbl(db, "samples"))

Clusters

cluster_labels <- fread(cluster_label_file)
data_matrix <- fread(data_matrix_file)

mat <- as.matrix(cluster_labels)
rownames(mat) <- data_matrix$condensed_id

cluster_pct <- mat/rowSums(mat, na.rm = TRUE)
remove_ind <- apply(cluster_pct, 1, function(x) all(is.na(x)))
cluster_pct <- cluster_pct[!remove_ind, ]

Number of samples: 56.

Selecting k

cluster_counts <- 2:7
kmeans_results <- lapply(cluster_counts, function(k) {
    kmeans(cluster_pct, centers = k, iter.max = 500, nstart = 50)
})
silwidths <- sapply(kmeans_results, function(x) {
    mean(silhouette(x$cluster, dist(cluster_pct))[, "sil_width"])
})
withinss <- sapply(kmeans_results, function(x) {
    sum(x$withinss)
})

clustdf <- data.frame(nclust = cluster_counts, silhouette = silwidths, withinss = withinss)
clustdf_melted <- melt(clustdf, id.vars = "nclust", measure.vars = c("silhouette", 
    "withinss"), variable.name = "metric", value.name = "value")
ggplot(clustdf_melted, aes(x = nclust, y = value)) + geom_point() + theme_bw() + 
    theme_Publication() + facet_wrap(~metric, scales = "free")

It’s quite abundantly clear that 3 is the optimal number of clusters, by both the silhouette and elbow rule.

cluster_labels <- kmeans_results[[which(cluster_counts == 3)]]$cluster
cluster_labels_df <- rownames_to_column(data.frame(cluster_labels), "condensed_id")

Heatmap

data_matrix <- merge(data_matrix, cluster_labels_df, by = c("condensed_id"))
data_matrix$cluster_labels <- factor(data_matrix$cluster_labels)

data_matrix_subset <- subset(data_matrix, select = -c(condensed_id, cluster_labels))
datmat <- as.matrix(clip_values(scale(data_matrix_subset), 2, -2))
rownames(datmat) <- data_matrix$condensed_id

tmp <- scale(data_matrix_subset)
dists <- as.dist((1 - cor(tmp, use = "pairwise.complete"))/2)
hc <- hclust(dists, method = "ward.D")

annotation_row <- data.frame(data_matrix$cluster_labels)
rownames(annotation_row) <- data_matrix$condensed_id

annotation_colors <- list(cluster_labels = get_colour_palette(data_matrix$cluster_labels))

pheatmap(datmat[order(data_matrix$cluster_labels), hc$order], cluster_rows = FALSE, 
    cluster_cols = FALSE, annotation_row = annotation_row, annotation_colors = annotation_colors)

This doesn’t look that great …

Might have to do with the fact that the clonal analysis view isn’t accurate still (divergence will throw things off).

Plain hierarchical clustering

dists2 <- as.dist((1 - cor(t(tmp), use = "pairwise.complete"))/2)
hc2 <- hclust(dists2, method = "ward.D")
pheatmap(datmat[hc2$order, hc$order], cluster_rows = FALSE, cluster_cols = FALSE)

---
title: "Multi-view clustering"
output: 
  html_notebook:
    toc: true
    toc_depth: 5
    toc_float: true
params:
  rmd: "multiviewclustering.Rmd"
---
                        ```{r, echo=FALSE, message=FALSE, warning=FALSE}

######## Snakemake header ########
library(methods)
Snakemake <- setClass(
    "Snakemake",
    slots = c(
        input = "list",
        output = "list",
        params = "list",
        wildcards = "list",
        threads = "numeric",
        log = "list",
        resources = "list",
        config = "list",
        rule = "character"
    )
)
snakemake <- Snakemake(
    input = list('/shahlab/alzhang/projects/ITH_Immune/paper/results/intermediates/run2/clustering/ivmvnmf_clusts.csv', 'Rmd/multiviewclustering.Rmd', '/shahlab/alzhang/projects/ITH_Immune/paper/results/intermediates/run2/clustering/data_matrix.tsv', '/shahlab/alzhang/projects/ITH_Immune/paper/analysis/Rmd/_site.yml', "cluster_labels" = '/shahlab/alzhang/projects/ITH_Immune/paper/results/intermediates/run2/clustering/ivmvnmf_clusts.csv', "notebook" = 'Rmd/multiviewclustering.Rmd', "data_matrix" = '/shahlab/alzhang/projects/ITH_Immune/paper/results/intermediates/run2/clustering/data_matrix.tsv', "site_file" = '/shahlab/alzhang/projects/ITH_Immune/paper/analysis/Rmd/_site.yml'),
    output = list('/shahlab/alzhang/projects/ITH_Immune/paper/results/web/multiviewclustering.nb.html'),
    params = list('/shahlab/alzhang/projects/ITH_Immune/metadata/db/immune_project.sqlite3', 'ithi-analysis-multimodal-clustering', "db" = '/shahlab/alzhang/projects/ITH_Immune/metadata/db/immune_project.sqlite3', "name" = 'ithi-analysis-multimodal-clustering'),
    wildcards = list(),
    threads = 1,
    log = list('/shahlab/alzhang/clusttmp/paperanalysis2/multimodal_clustering.log'),
    resources = list(),
    config = list("bcr_diversity" = '/shahlab/alzhang/pipeline_outputs/ith_immune/mixcr/mixcr_runs/ith_1_2_3/mixcr5/postprocess/IGH/postfilter_diversity_stats/diversity.strict.resampled.txt', "classifier_type" = 'dlda', "xcr_qc_notebook" = 'Rmd/replicates.Rmd', "phenotype_threshold" = 0.85, "ith_til_notebook" = 'Rmd/ith_til_densities.Rmd', "logdir" = '/shahlab/alzhang/clusttmp/paperanalysis2', "nanostring_signature_notebook" = 'Rmd/nanostring_signatures.Rmd', "notebook_dir" = '/shahlab/alzhang/projects/ITH_Immune/paper/results/web', "tils_for_cluster" = c('E_CD8_density', 'E_CD4_density', 'E_CD20_density', 'E_Plasma_density'), "bcrphylo_examples_notebook" = 'Rmd/bcr_phylo_examples.Rmd', "subtype_marker_file" = '/shahlab/alzhang/projects/ITH_Immune/data/expression/nanostring/subtype_markers.tsv', "db" = '/shahlab/alzhang/projects/ITH_Immune/metadata/db/immune_project.sqlite3', "v_dictionary" = '/shahlab/alzhang/projects/ITH_Immune/subprojects/immtyper/metadata/imgt/Homo_sapiens_TRBV.fasta', "multiviewclustering_notebook" = 'Rmd/multiviewclustering.Rmd', "molsubtype_tiltypes" = c('E_CD8_density', 'E_CD4_density', 'E_CD20_density', 'E_Plasma_density', 'S_CD8_density', 'S_CD4_density', 'S_CD20_density', 'S_Plasma_density'), "immtyper_lengths" = '11 12 13 14 15 16 17 18', "xcr_clones_notebook" = 'Rmd/xcr_clones_analysis.Rmd', "known_subtype_file" = '/shahlab/alzhang/projects/ITH_Immune/data/expression/array/subtypes/known_subtypes.tsv', "tcr_clonotypes" = '/shahlab/alzhang/pipeline_outputs/ith_immune/mixcr/mixcr_runs/ith_1_2_3/mixcr5/clonotypes/TRB_clonotypes_filtered.txt', "molsubtype_notebook" = 'Rmd/molecular_subtypes.Rmd', "ith_statistics_notebook" = 'Rmd/ith_statistics.Rmd', "library_sizes" = '/shahlab/alzhang/pipeline_outputs/ith_immune/mixcr/mixcr_runs/ith_1_2_3/mixcr5/library_sizes.tsv', "j_dictionary" = '/shahlab/alzhang/projects/ITH_Immune/subprojects/immtyper/metadata/imgt/Homo_sapiens_TRBJ.fasta', "site_file" = '/shahlab/alzhang/projects/ITH_Immune/paper/analysis/Rmd/_site.yml', "mvclust_tiltypes" = c('E_CD8_density', 'E_CD4_density', 'E_CD20_density', 'E_Plasma_density', 'S_CD8_density', 'S_CD4_density', 'S_CD20_density', 'S_Plasma_density'), "immtyper_models" = '/shahlab/alzhang/projects/ITH_Immune/results/immtyper_results/klarenbeek/aa_vj/gradboost', "prevalence_threshold" = 0.01, "ihc_run1" = '/shahlab/alzhang/projects/ITH_Immune/data/ihc/cd8cd3cd20/validated_stats_weighted.rdata', "variability_type" = 'stabilize', "clone_branch_length_file" = '/shahlab/alzhang/projects/ITH_Immune/data/ith/complete/branch_data.tsv', "table_dir" = '/shahlab/alzhang/projects/ITH_Immune/paper/results/tables/run2', "bcr_clonotypes" = '/shahlab/alzhang/pipeline_outputs/ith_immune/mixcr/mixcr_runs/ith_1_2_3/mixcr5/clonotypes/IGH_clonotypes_filtered.txt', "intermediate_dir" = '/shahlab/alzhang/projects/ITH_Immune/paper/results/intermediates/run2', "xcr_mapping_notebook" = 'Rmd/xcr_mapping.Rmd', "benchmarkdir" = '/shahlab/alzhang/benchmarks/paperanalysis2', "ith_stats_file" = '/shahlab/alzhang/projects/ITH_Immune/data/ith/complete/clonal_measures.tsv', "tcr_diversity" = '/shahlab/alzhang/pipeline_outputs/ith_immune/mixcr/mixcr_runs/ith_1_2_3/mixcr5/postprocess/TRB/postfilter_diversity_stats/diversity.strict.resampled.txt', "example_annotations" = '/shahlab/alzhang/pipeline_outputs/ith_immune/igpartition/run13/final_partitions/ith2_2/clust9/annotations_flagged.tsv', "nanostring_annotations" = '/shahlab/alzhang/projects/ITH_Immune/data/expression/nanostring/pancancer_annotations.tsv', "til_classifier_notebook" = 'Rmd/til_classifier.Rmd', "mvclust_nclust" = 3, "clone_prevalence_file" = '/shahlab/alzhang/projects/ITH_Immune/data/ith/complete/clone_data.tsv', "ihc_run2" = '/shahlab/alzhang/projects/ITH_Immune/data/ihc/cd79cd138cd68/validated_stats_weighted.rdata', "immune_variability_notebook" = 'Rmd/immune_variability.Rmd', "nanostring_data" = '/shahlab/alzhang/projects/ITH_Immune/results/nanostring_results/ith_full/qc/limma_tmm/normalized_expression_voa_labels.tsv', "xcr_distance_method" = 'horn', "nclusts" = 2, "example_msa_plot" = '/shahlab/alzhang/pipeline_outputs/ith_immune/igpartition/run13/old/alignment_plots/msa/ith2_2/clust9/indel_reversed.html', "clone_tree_file" = '/shahlab/alzhang/projects/ITH_Immune/data/ith/complete/tree_data.tsv', "tils_for_variability" = c('E_CD8_density', 'E_CD4_density', 'E_CD20_density', 'E_Plasma_density', 'S_CD8_density', 'S_CD4_density', 'S_CD20_density', 'S_Plasma_density'), "spatial_result_dirs" = list("stromal" = '/shahlab/alzhang/pipeline_outputs/ith_immune/spatsim/ith5/abc', "epithelial" = '/shahlab/alzhang/pipeline_outputs/ith_immune/spatsim/ith3/abc'), "index_notebook" = 'Rmd/index.Rmd', "spatial_notebook" = 'Rmd/spatial_analysis.Rmd'),
    rule = 'multimodal_clustering'
)
######## Original script #########

                        ```


```{r global_chunk_options, include=FALSE}
knitr::opts_chunk$set(echo = TRUE, tidy=TRUE, warning=FALSE, message=FALSE)
```


```{r}
library(plyr)
library(dplyr)
library(data.table)
library(ggplot2)
library(pheatmap)
library(methods)
library(grid)
library(reshape2)
library(nlme)
library(stringr)
library(psych)
library(tibble)
library(cluster)

library(metautils)
library(ithi.meta)
library(clonecluster)
library(paperutils)
```

## Colour palettes

```{r}
pal_patient <- select_palette("patient")
```

## Parameters

```{r}
db_path <- snakemake@params$db
cluster_label_file <- snakemake@input$cluster_labels
## For sample labels and plotting
data_matrix_file <- snakemake@input$data_matrix
```

## Metadata

```{r}
db <- src_sqlite(db_path, create=FALSE)
samples <- collect(tbl(db, "samples"))
```

## Clusters

```{r}
cluster_labels <- fread(cluster_label_file)
data_matrix <- fread(data_matrix_file)

mat <- as.matrix(cluster_labels)
rownames(mat) <- data_matrix$condensed_id

cluster_pct <- mat/rowSums(mat, na.rm = TRUE)
remove_ind <- apply(cluster_pct, 1, function(x) all(is.na(x)))
cluster_pct <- cluster_pct[!remove_ind,]
```

Number of samples: `r nrow(mat)`. 

### Selecting k

```{r}
cluster_counts <- 2:7
kmeans_results <- lapply(cluster_counts, function(k) {
  kmeans(cluster_pct, centers = k, iter.max = 500, nstart=50)
})
silwidths <- sapply(kmeans_results, function(x) {
  mean(silhouette(x$cluster, dist(cluster_pct))[,"sil_width"])
})
withinss <- sapply(kmeans_results, function(x) {
  sum(x$withinss)
})

clustdf <- data.frame(nclust=cluster_counts, silhouette=silwidths, withinss=withinss)
clustdf_melted <- melt(clustdf, id.vars = "nclust", measure.vars = c("silhouette", "withinss"), variable.name = "metric",
     value.name = "value")

```

```{r}
ggplot(clustdf_melted, aes(x=nclust, y=value)) + geom_point() + theme_bw() + theme_Publication() + facet_wrap(~ metric, scales= "free")
```


It's quite abundantly clear that 3 is the optimal number of clusters, by both the silhouette and elbow rule. 

```{r}
cluster_labels <- kmeans_results[[which(cluster_counts == 3)]]$cluster
cluster_labels_df <- rownames_to_column(data.frame(cluster_labels), "condensed_id")
```

### Heatmap

```{r, fig.width=12, fig.height=6}
data_matrix <- merge(data_matrix, cluster_labels_df, by=c("condensed_id"))
data_matrix$cluster_labels <- factor(data_matrix$cluster_labels)

data_matrix_subset <- subset(data_matrix, select=-c(condensed_id, cluster_labels))
datmat <- as.matrix(clip_values(scale(data_matrix_subset), 2, -2))
rownames(datmat) <- data_matrix$condensed_id

tmp <- scale(data_matrix_subset)
dists <- as.dist((1-cor(tmp, use = "pairwise.complete"))/2)
hc <- hclust(dists, method = "ward.D")

annotation_row <- data.frame(data_matrix$cluster_labels)
rownames(annotation_row) <- data_matrix$condensed_id

annotation_colors <- list(cluster_labels=get_colour_palette(data_matrix$cluster_labels))

pheatmap(datmat[order(data_matrix$cluster_labels), hc$order], cluster_rows = FALSE, cluster_cols = FALSE, annotation_row = annotation_row, annotation_colors = annotation_colors)
```

This doesn't look that great ...

Might have to do with the fact that the clonal analysis view isn't accurate still (divergence will throw things off). 

## Plain hierarchical clustering

```{r}
dists2 <- as.dist((1-cor(t(tmp), use = "pairwise.complete"))/2)
hc2 <- hclust(dists2, method = "ward.D")
pheatmap(datmat[hc2$order,hc$order], cluster_rows = FALSE, cluster_cols = FALSE)
```


<!-- ## t-SNE -->

<!-- ```{r} -->
<!-- dat <- data_matrix[apply(data_matrix, 1, function(x)!any(is.na(x))),] -->

<!-- patients <- map_id(dat$condensed_id, from = "condensed_id", to="patient_id", db_path) -->
<!-- ecb <- function(x,y){plot(x,t='n');text(x,labels=patients,col=get_colour_palette(patients)[patients])} -->

<!-- tsne_dat <- tsne(scale(dat[,2:ncol(dat)]), epoch_callback = ecb, perplexity = 2, whiten = TRUE, max_iter = 5000) -->
<!-- ``` -->