Analysis of Spatial Transcriptomic data

GeneCodeR analysis of Spatial Transcriptomic data

Imaging tissue contains genes spatially organised by the extracellular matrix, individual cell types and groups of cell compartments. Using linear algebra and vectors, the expected patterns of the histology images are learned (example patterns of (1): extracellular matrix stained in eosin, and (2): nuclei of cells stained in hematoxylin). Assigning each sample image to a learned pattern (by the Generative Encoder algorithm) is similar to assigning an image weight to a pattern vector. Accordingly, genes are also assigned weights to the same set of expected image patterns. This encodes the information into the same space.

The library that wraps around the Generative Encoder algorithm (gcode) is called GeneCodeR and provides a more comprehensive interface to load data from path, run gcode, extract relevant transformations, and validate the learned patterns.

# Main libraries for analysis
library(GeneCodeR)
library(gcode)

# Main libraries for plotting
library(ggplot2)
library(grid)
library(gridExtra)

Please replace these paths

main_path <- "~/Documents/main_files/AskExplain/Q4_2022/gcode/"

# Please replace these paths
path_to_save <- paste(main_path,"./temp_save_dir/",sep="")
path_to_work <- paste(main_path,"./temp_work_dir/",sep="")

Download data and format

dir.create(path_to_save)

## Warning in dir.create(path_to_save): '/Users/dbanh_side/Documents/main_files/
## AskExplain/Q4_2022/gcode/./temp_save_dir' already exists

dir.create(path_to_work)

## Warning in dir.create(path_to_work): '/Users/dbanh_side/Documents/main_files/
## AskExplain/Q4_2022/gcode/./temp_work_dir' already exists

setwd(path_to_work)
curl::curl_download(url = "https://prod-dcd-datasets-cache-zipfiles.s3.eu-west-1.amazonaws.com/29ntw7sh4r-5.zip", destfile = paste(path_to_work,"he_et_al.zip",sep=""))
utils::unzip(zipfile = paste(path_to_work,"he_et_al.zip",sep=""), exdir = path_to_work)
path_to_work <- paste(path_to_work,"29ntw7sh4r-5",sep="")

setwd(path_to_work)

main_files <- list.files(path_to_work, full.names = T)
new_files <- gsub(pattern = "BC", replacement = "BT", x = main_files)
file.rename(from = main_files, to = new_files)

##   [1] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
##  [16] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
##  [31] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
##  [46] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
##  [61] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
##  [76] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
##  [91] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [106] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [121] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [136] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [151] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [166] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [181] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [196] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [211] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [226] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [241] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [256] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [271] TRUE TRUE TRUE

GeneCodeR takes in three input files from a generic Spatial Transcriptomic dataset, per tissue slide:

1 - coordinates 2 - gene expression 3 - images

# Input list of files

file_path_list <- GeneCodeR::extract_path_framework(F)

file_path_list$meta$path <- list.files(path_to_work,pattern = "*Coords.tsv.gz",full.names = T)
file_path_list$coord$path <- list.files(path_to_work,pattern = "*csv.gz",full.names = T)
file_path_list$gex$path <- list.files(path_to_work,pattern = "*stdata.tsv.gz",full.names = T)
file_path_list$pixel$path <- list.files(path_to_work,pattern = "*jpg",full.names = T)

set.seed(1)
train_ids <- sample(c(1:length(file_path_list$coord$path)),round(length(file_path_list$coord$path)*0.9,0))
test_ids <- c(1:length(file_path_list$coord$path))[-train_ids]

train_file_path_list <- lapply(file_path_list,function(X){
  list(path=X$path[train_ids])
})
test_file_path_list <- lapply(file_path_list,function(X){
  list(path=X$path[-train_ids])
})

The GeneCodeR configuration file takes in several parameters.

The first set of parameters is information to extract spots - such as spot size, and any augmentations such as displacement of the coordinates, or rotation of the image.

The second set of parameters is for after the gcode model is learned (via GeneCodeR wrapper). This is to transfrom from one modality, to another modality (for example, from image to gene expression).

# Set up genecoder configuration parameters

genecoder.config <- GeneCodeR::extract_config_framework(F)

The meta information file takes in several parameters per coordinate, gene expression and image.

The first set of parameters is to read in the file. Here, example parameters such as the file type (for example, is it a .csv, .tsv, .jpg, .png, market matrix file (.mtx)?), and other important parameters based on file type (for example, does it contain a header, what is the separator …).

Importantly, another set of parameters is the factor identifier in the coordinate file - which columns contain the x-coordinates, y-coordinates and the label identifier for the spots? For example, the label identifiers, are given by the rownames indicated by a 0.

# Set up meta information

meta_info_list <- GeneCodeR::extract_meta_framework(F)
meta_info_list$meta$read_file$file_type <- "tsv"
meta_info_list$coord$read_file$file_type <- "csv"
meta_info_list$gex$read_file$file_type <- "tsv"
meta_info_list$pixel$read_file$file_type <- "image"

meta_info_list$meta$read_file$meta$sep <- "\t"
meta_info_list$coord$read_file$meta$header <- meta_info_list$gex$read_file$meta$header <- meta_info_list$meta$read_file$meta$header <- T
meta_info_list$coord$read_file$meta$sep <- ","
meta_info_list$gex$read_file$meta$sep <- "\t"
meta_info_list$meta$read_file$meta$row.names <- 1
meta_info_list$coord$read_file$meta$quote <- meta_info_list$gex$read_file$meta$quote <- meta_info_list$meta$read_file$meta$quote <- ""
meta_info_list$coord$read_file$meta$row.names <- 1
meta_info_list$gex$read_file$meta$row.names <- 1

meta_info_list$coord$factor_id$labels <- 0
meta_info_list$coord$factor_id$coord_x <- 1
meta_info_list$coord$factor_id$coord_y <- 2

To run the analysis appropriately, a set of common genes are required for all spots.

common_genes <- lapply(c(1:length(file_path_list$gex$path)),function(X){
  print(X)
  colnames(GeneCodeR::read_file(path = file_path_list$gex$path[X], meta_info = meta_info_list$gex$read_file)[[1]])
})

## [1] 1
## [1] 2
## [1] 3
## [1] 4
## [1] 5
## [1] 6
## [1] 7
## [1] 8
## [1] 9
## [1] 10
## [1] 11
## [1] 12
## [1] 13
## [1] 14
## [1] 15
## [1] 16
## [1] 17
## [1] 18
## [1] 19
## [1] 20
## [1] 21
## [1] 22
## [1] 23
## [1] 24
## [1] 25
## [1] 26
## [1] 27
## [1] 28
## [1] 29
## [1] 30
## [1] 31
## [1] 32
## [1] 33
## [1] 34
## [1] 35
## [1] 36
## [1] 37
## [1] 38
## [1] 39
## [1] 40
## [1] 41
## [1] 42
## [1] 43
## [1] 44
## [1] 45
## [1] 46
## [1] 47
## [1] 48
## [1] 49
## [1] 50
## [1] 51
## [1] 52
## [1] 53
## [1] 54
## [1] 55
## [1] 56
## [1] 57
## [1] 58
## [1] 59
## [1] 60
## [1] 61
## [1] 62
## [1] 63
## [1] 64
## [1] 65
## [1] 66
## [1] 67
## [1] 68

meta_info_list$gex$factor$common_genes <- Reduce("intersect",common_genes)

Coordinate labels also need to be extracted.

meta_info_list$coord$factor$labels <- lapply(c(1:length(train_file_path_list$coord$path)),function(X){
  row.names(GeneCodeR::read_file(path = train_file_path_list$coord$path[X], meta_info = meta_info_list$coord$read_file)[[1]])
})

Gene expression data is extracted using the path list, meta information, and configuration parameters.

# Extract gex data
gex_data <- GeneCodeR::prepare_gex(file_path_list = train_file_path_list,meta_info_list = meta_info_list,config = genecoder.config)

## [1] "Extracting gex"
## [1] "Preparing gex      1"
## [1] "Preparing gex      2"
## [1] "Preparing gex      3"
## [1] "Preparing gex      4"
## [1] "Preparing gex      5"
## [1] "Preparing gex      6"
## [1] "Preparing gex      7"
## [1] "Preparing gex      8"
## [1] "Preparing gex      9"
## [1] "Preparing gex      10"
## [1] "Preparing gex      11"
## [1] "Preparing gex      12"
## [1] "Preparing gex      13"
## [1] "Preparing gex      14"
## [1] "Preparing gex      15"
## [1] "Preparing gex      16"
## [1] "Preparing gex      17"
## [1] "Preparing gex      18"
## [1] "Preparing gex      19"
## [1] "Preparing gex      20"
## [1] "Preparing gex      21"
## [1] "Preparing gex      22"
## [1] "Preparing gex      23"
## [1] "Preparing gex      24"
## [1] "Preparing gex      25"
## [1] "Preparing gex      26"
## [1] "Preparing gex      27"
## [1] "Preparing gex      28"
## [1] "Preparing gex      29"
## [1] "Preparing gex      30"
## [1] "Preparing gex      31"
## [1] "Preparing gex      32"
## [1] "Preparing gex      33"
## [1] "Preparing gex      34"
## [1] "Preparing gex      35"
## [1] "Preparing gex      36"
## [1] "Preparing gex      37"
## [1] "Preparing gex      38"
## [1] "Preparing gex      39"
## [1] "Preparing gex      40"
## [1] "Preparing gex      41"
## [1] "Preparing gex      42"
## [1] "Preparing gex      43"
## [1] "Preparing gex      44"
## [1] "Preparing gex      45"
## [1] "Preparing gex      46"
## [1] "Preparing gex      47"
## [1] "Preparing gex      48"
## [1] "Preparing gex      49"
## [1] "Preparing gex      50"
## [1] "Preparing gex      51"
## [1] "Preparing gex      52"
## [1] "Preparing gex      53"
## [1] "Preparing gex      54"
## [1] "Preparing gex      55"
## [1] "Preparing gex      56"
## [1] "Preparing gex      57"
## [1] "Preparing gex      58"
## [1] "Preparing gex      59"
## [1] "Preparing gex      60"
## [1] "Preparing gex      61"
## [1] "Done preparation!"

Importantly, labels for the meta information from the gene expression data extraction is required for extracting relevant spots that contain gene expression greater than background.

# Store meta data
meta_info_list$gex$factor$labels <- gex_data$labels


# Extract spot data

genecoder.config$extract_spots$window_size <- 30

spot_data <- GeneCodeR::prepare_spot(file_path_list = train_file_path_list,meta_info_list = meta_info_list,config = genecoder.config, gex_data = gex_data$gex)

## [1] "Extracting spots"
## [1] "Preparing spot      1"
## [1] "Preparing spot      2"
## [1] "Preparing spot      3"
## [1] "Preparing spot      4"
## [1] "Preparing spot      5"
## [1] "Preparing spot      6"
## [1] "Preparing spot      7"
## [1] "Preparing spot      8"
## [1] "Preparing spot      9"
## [1] "Preparing spot      10"
## [1] "Preparing spot      11"
## [1] "Preparing spot      12"
## [1] "Preparing spot      13"
## [1] "Preparing spot      14"
## [1] "Preparing spot      15"
## [1] "Preparing spot      16"
## [1] "Preparing spot      17"
## [1] "Preparing spot      18"
## [1] "Preparing spot      19"
## [1] "Preparing spot      20"
## [1] "Preparing spot      21"
## [1] "Preparing spot      22"
## [1] "Preparing spot      23"
## [1] "Preparing spot      24"
## [1] "Preparing spot      25"
## [1] "Preparing spot      26"
## [1] "Preparing spot      27"
## [1] "Preparing spot      28"
## [1] "Preparing spot      29"
## [1] "Preparing spot      30"
## [1] "Preparing spot      31"
## [1] "Preparing spot      32"
## [1] "Preparing spot      33"
## [1] "Preparing spot      34"
## [1] "Preparing spot      35"
## [1] "Preparing spot      36"
## [1] "Preparing spot      37"
## [1] "Preparing spot      38"
## [1] "Preparing spot      39"
## [1] "Preparing spot      40"
## [1] "Preparing spot      41"
## [1] "Preparing spot      42"
## [1] "Preparing spot      43"
## [1] "Preparing spot      44"
## [1] "Preparing spot      45"
## [1] "Preparing spot      46"
## [1] "Preparing spot      47"
## [1] "Preparing spot      48"
## [1] "Preparing spot      49"
## [1] "Preparing spot      50"
## [1] "Preparing spot      51"
## [1] "Preparing spot      52"
## [1] "Preparing spot      53"
## [1] "Preparing spot      54"
## [1] "Preparing spot      55"
## [1] "Preparing spot      56"
## [1] "Preparing spot      57"
## [1] "Preparing spot      58"
## [1] "Preparing spot      59"
## [1] "Preparing spot      60"
## [1] "Preparing spot      61"
## [1] "Done preparation!"

Relevant frameworks are set up to begin learning the Generative Encoder model.

Firstly, set up the data list, which stores the gene expression and spot data.

# Set up data

data_list <- list(spot_data$spot,as.matrix(spot_data$gex))

The join framework is set up, creating the model structure that learns (1): a similar function for “all” datasets, or (2): separate functions across gene expression and image pixel spots.

Here, only the incode and the beta parameters are set up to learn the individual patterns between genes and images. Given the samples are extracted from the same tissue site (image and genes at the same spot), the sample patterns are shared. The feature patterns and the sample patterns (see first paragraph - genes assigned to expected image patterns of hematoxylin and eosin stains) are mapped to a shared latent space via the alpha and beta codes. For example, the use of image patterns is analogous to pattern matching to the example image spot, the latent codes un-rotate any example image spots until it best fits the image pattern.

# Set up join information

join <- gcode::extract_join_framework(F)
join$complete$data_list <- c(1,2)
join$complete$alpha_sample <- c(1,1)
join$complete$beta_sample <- c(1,2)
join$complete$alpha_signal <- c(1,1)
join$complete$beta_signal <- c(1,2)
join$complete$code <- c(1,1)
join$covariance <- c(0,0)

reference <- gcode::extract_references_framework(F)
reference$data_list <- c(1)

Set up configuration information - the dimensions of the encoding space for the samples (i_dim), and the features (j_dim), as well as the dimensions of the latent code (k_dim) are set up. The initialisation for these parameters are set up via the irlba library using partial Singular Value Decomposition.

# Set up gcode config
gcode.config <- gcode::extract_config(F)
gcode.config$init <- list(alpha_sample="irlba",beta_sample="irlba")
gcode.config$i_dim <- 500
gcode.config$j_dim <- 500
gcode.config$dimension_reduction <- F
gcode.config$max_iter <- 100
gcode.config$seed <- 1
gcode.config$tol <- 1
gcode.config$n.cores <- 5
gcode.config$learn_rate <- 1e-1
gcode.config$batch_size <- 500

Run the Generative Encoder model

# Set up gcode model

genecoder.model <- GeneCodeR::learn_model(data_list = data_list, config = gcode.config, join = join, reference = reference)

## [1] "Initialising alpha_sample with : irlba"
## [2] "Initialising beta_sample with : irlba"

## Warning in irlba::irlba(as.matrix(x), nu = config$i_dim, maxit = 1): did not
## converge--results might be invalid!; try increasing work or maxit

## Warning in irlba::irlba(as.matrix(x), nv = config$j_dim, maxit = 1): did not
## converge--results might be invalid!; try increasing work or maxit

## Warning in irlba::irlba(as.matrix(x), nv = config$j_dim, maxit = 1): did not
## converge--results might be invalid!; try increasing work or maxit

## [1] "Beginning gcode learning with:    Sample dimension reduction (config$i_dim): 500    Feature dimension reduction (config$j_dim): 500    Latent invariant dimension (config$k_dim):     Tolerance Threshold: 1   Maximum number of iterations: 100   Verbose: TRUE"
## [1] "Iteration: 1 with Tolerance of: 21.1634805067891"
## [1] "Iteration: 2 with Tolerance of: 63.4720802714958"
## [1] "Iteration: 3 with Tolerance of: 8.75341078830188"
## [1] "Iteration: 4 with Tolerance of: 82.3722229279707"
## [1] "Iteration: 5 with Tolerance of: 0.272863785887901"
## [1] "Learning has converged for gcode, beginning (if requested) dimension reduction"
## [1] "Done! Total runtime of   43.044757048289"

Now begins the testing validation phase…

First extract the test set of the coordinate factor labels.

# Evaluate on test set

meta_info_list$coord$factor$labels <- lapply(c(1:length(test_file_path_list$coord$path)),function(X){
  row.names(GeneCodeR::read_file(path = test_file_path_list$coord$path[X], meta_info = meta_info_list$coord$read_file)[[1]])
})

Next extract the test set of the gene expression data.

# Extract test gex data
test_gex_data <- GeneCodeR::prepare_gex(file_path_list = test_file_path_list,meta_info_list = meta_info_list,config = genecoder.config)

## [1] "Extracting gex"
## [1] "Preparing gex      1"
## [1] "Preparing gex      2"
## [1] "Preparing gex      3"
## [1] "Preparing gex      4"
## [1] "Preparing gex      5"
## [1] "Preparing gex      6"
## [1] "Preparing gex      7"
## [1] "Done preparation!"

meta_info_list$gex$factor$labels <- test_gex_data$labels

Save relevant information, including the model, the test meta information, the test file list and the test gene expression data. This is to analyse results later.

base_test_spot_data <- GeneCodeR::prepare_spot(file_path_list = test_file_path_list,meta_info_list = meta_info_list,config = genecoder.config, gex_data = test_gex_data$gex)

## [1] "Extracting spots"
## [1] "Preparing spot      1"
## [1] "Preparing spot      2"
## [1] "Preparing spot      3"
## [1] "Preparing spot      4"
## [1] "Preparing spot      5"
## [1] "Preparing spot      6"
## [1] "Preparing spot      7"
## [1] "Done preparation!"

non_zero_markers <- base_test_spot_data$gex>0


save.image(file = paste(sep="",path_to_save,"all_genecoder.RData"))

help@askexplain.com

2022-12-18

GeneCodeR analysis of Spatial Transcriptomic data