Chapter 1 Down the rabbit-hole

Javier Diaz - javier.diazmejia@gmail.com
Script based on: https://satijalab.org/seurat/articles/integration_introduction.html
GENERAL OVERVIEW OF THIS SCRIPT
1. Loads scRNA-seq data from Cell Ranger (MTX files)
2. Generates QC plots for each dataset
3. Normalizes each dataset
4. Saves R objects with each normalized dataset

1.0.1 DEFINE ENVIRONMENT, INPUTS AND OUTDIR

oldw <- getOption("warn")
options( warn = -1 )

1.0.1.0.0.1 Required libraries

library(Seurat)   # (CRAN) main scRNA-seq analysis package

## Attaching SeuratObject

1.0.1.0.0.2 User’s home and stopwatch to time run

UserHomeDirectory <- Sys.getenv("HOME")[[1]]
print(UserHomeDirectory)

## [1] "/Users/jdiazmej"

StopWatchStart <- list()
StopWatchEnd   <- list()

StopWatchStart$Overall  <- Sys.time()

1.0.1.0.0.3 Define inputs

### Select glioblastoma datasets from Richards_NatCancer_2021
DatasetIds <- list( ## List of input IDs to process
  `1` = "G1003_A_T",
  `2` = "G620_T",
  `3` = "G910_A_T",
  `4` = "G945_I_T",
  `5` = "G946_I_T",
  `6` = "G967_A_T",
  `7` = "G983_A_T"
)

### /path_to/MTX_DIRECTORIES from 10X Cell Ranger (i.e. 'filtered_feature_bc_matrix' directories
### with [features.tsv.gz, barcodes.tsv.gz and matrix.mtx.gz] files)
PathToDatasets <- list(
  "G1003_A_T" = "~/CourseData/CAN_data/Module7/MTX_FILES/Richards_NatCancer_2021/G1003_A_T/",
  "G620_T"    = "~/CourseData/CAN_data/Module7/MTX_FILES/Richards_NatCancer_2021/G620_T/",
  "G910_A_T"  = "~/CourseData/CAN_data/Module7/MTX_FILES/Richards_NatCancer_2021/G910_A_T/",
  "G945_I_T"  = "~/CourseData/CAN_data/Module7/MTX_FILES/Richards_NatCancer_2021/G945_I_T/",
  "G946_I_T"  = "~/CourseData/CAN_data/Module7/MTX_FILES/Richards_NatCancer_2021/G946_I_T/",
  "G967_A_T"  = "~/CourseData/CAN_data/Module7/MTX_FILES/Richards_NatCancer_2021/G967_A_T/",
  "G983_A_T"  = "~/CourseData/CAN_data/Module7/MTX_FILES/Richards_NatCancer_2021/G983_A_T/"
)

1.0.1.0.0.4 Define outputs

### Outputs
PathForOutfiles <- "~/CourseData/CAN_data/Module7/OUTFILES/Richards_NatCancer_2021" ## /path_to/out_directory
PrefixOutfiles  <- "Richards_NatCancer_2021"  ## Prefix for outfiles
PathForOutfiles <- gsub("^~/",paste0(UserHomeDirectory,"/"), PathForOutfiles)
dir.create(path = PathForOutfiles, recursive = T, showWarnings = F)

1.0.1.0.0.5 Define default parameters

DefaultParameters <- list(
  
  ### Parameters for QC plots
  CellPropertiesToQC = c("nFeature_RNA", "nCount_RNA", "mito.fraction"),
  
  ### Parameters for Seurat filters
  MinCells = 1,
  MinGenes = 50,
  MaxReads = 80000,
  MaxMitoFraction = 0.2
)

1.0.1.0.0.6 Report R sessionInfo

OutfileRSessionInfo<-paste0(PathForOutfiles, "/", PrefixOutfiles, "_SingleCell_1_QC_Normalization_RSessionInfo.txt")
writeLines(capture.output(sessionInfo()), OutfileRSessionInfo)
capture.output(sessionInfo())

##  [1] "R version 4.0.2 (2020-06-22)"                                                                                           
##  [2] "Platform: x86_64-apple-darwin17.0 (64-bit)"                                                                             
##  [3] "Running under: macOS High Sierra 10.13.6"                                                                               
##  [4] ""                                                                                                                       
##  [5] "Matrix products: default"                                                                                               
##  [6] "BLAS:   /System/Library/Frameworks/Accelerate.framework/Versions/A/Frameworks/vecLib.framework/Versions/A/libBLAS.dylib"
##  [7] "LAPACK: /Library/Frameworks/R.framework/Versions/4.0/Resources/lib/libRlapack.dylib"                                    
##  [8] ""                                                                                                                       
##  [9] "locale:"                                                                                                                
## [10] "[1] en_CA.UTF-8/en_CA.UTF-8/en_CA.UTF-8/C/en_CA.UTF-8/en_CA.UTF-8"                                                      
## [11] ""                                                                                                                       
## [12] "attached base packages:"                                                                                                
## [13] "[1] stats     graphics  grDevices utils     datasets  methods   base     "                                              
## [14] ""                                                                                                                       
## [15] "other attached packages:"                                                                                               
## [16] "[1] SeuratObject_4.0.1 Seurat_4.0.2      "                                                                              
## [17] ""                                                                                                                       
## [18] "loaded via a namespace (and not attached):"                                                                             
## [19] "  [1] Rtsne_0.15            colorspace_2.0-1      deldir_0.2-10         ellipsis_0.3.2       "                          
## [20] "  [5] ggridges_0.5.3        rstudioapi_0.13       spatstat.data_2.1-0   leiden_0.3.8         "                          
## [21] "  [9] listenv_0.8.0         ggrepel_0.9.1         fansi_0.5.0           codetools_0.2-18     "                          
## [22] " [13] splines_4.0.2         knitr_1.33            polyclip_1.10-0       jsonlite_1.7.2       "                          
## [23] " [17] packrat_0.6.0         ica_1.0-2             cluster_2.1.2         png_0.1-7            "                          
## [24] " [21] uwot_0.1.10           shiny_1.6.0           sctransform_0.3.2     spatstat.sparse_2.0-0"                          
## [25] " [25] compiler_4.0.2        httr_1.4.2            assertthat_0.2.1      Matrix_1.3-3         "                          
## [26] " [29] fastmap_1.1.0         lazyeval_0.2.2        later_1.2.0           htmltools_0.5.1.1    "                          
## [27] " [33] tools_4.0.2           igraph_1.2.6          gtable_0.3.0          glue_1.4.2           "                          
## [28] " [37] RANN_2.6.1            reshape2_1.4.4        dplyr_1.0.6           Rcpp_1.0.6           "                          
## [29] " [41] scattermore_0.7       jquerylib_0.1.4       vctrs_0.3.8           nlme_3.1-152         "                          
## [30] " [45] lmtest_0.9-38         xfun_0.23             stringr_1.4.0         globals_0.14.0       "                          
## [31] " [49] mime_0.10             miniUI_0.1.1.1        lifecycle_1.0.0       irlba_2.3.3          "                          
## [32] " [53] goftest_1.2-2         future_1.21.0         MASS_7.3-54           zoo_1.8-9            "                          
## [33] " [57] scales_1.1.1          spatstat.core_2.1-2   promises_1.2.0.1      spatstat.utils_2.1-0 "                          
## [34] " [61] parallel_4.0.2        RColorBrewer_1.1-2    yaml_2.2.1            reticulate_1.20      "                          
## [35] " [65] pbapply_1.4-3         gridExtra_2.3         ggplot2_3.3.3         sass_0.4.0           "                          
## [36] " [69] rpart_4.1-15          stringi_1.6.2         rlang_0.4.11          pkgconfig_2.0.3      "                          
## [37] " [73] matrixStats_0.59.0    evaluate_0.14         lattice_0.20-44       ROCR_1.0-11          "                          
## [38] " [77] purrr_0.3.4           tensor_1.5            patchwork_1.1.1       htmlwidgets_1.5.3    "                          
## [39] " [81] cowplot_1.1.1         tidyselect_1.1.1      parallelly_1.25.0     RcppAnnoy_0.0.18     "                          
## [40] " [85] plyr_1.8.6            magrittr_2.0.1        bookdown_0.22         R6_2.5.0             "                          
## [41] " [89] generics_0.1.0        DBI_1.1.1             mgcv_1.8-35           pillar_1.6.1         "                          
## [42] " [93] fitdistrplus_1.1-5    survival_3.2-11       abind_1.4-5           tibble_3.1.2         "                          
## [43] " [97] future.apply_1.7.0    crayon_1.4.1          KernSmooth_2.23-20    utf8_1.2.1           "                          
## [44] "[101] spatstat.geom_2.1-0   plotly_4.9.3          rmarkdown_2.8         grid_4.0.2           "                          
## [45] "[105] data.table_1.14.0     digest_0.6.27         xtable_1.8-4          tidyr_1.1.3          "                          
## [46] "[109] httpuv_1.6.1          munsell_0.5.0         viridisLite_0.4.0     bslib_0.2.5.1        "

1.0.2 LOAD DATASETS

####################################
###Load datasets from local (MTX files produced by Cell Ranger) and create Seurat objects
####################################

SeuratObjectsUnfiltered <-list()

for (datasetNumber in c(1:length(DatasetIds))) {
  dataset <- DatasetIds[[as.character(datasetNumber)]]
  DatasetPath <- PathToDatasets[[dataset]]
  DatasetPath<-gsub("^~/",paste0(UserHomeDirectory,"/"), DatasetPath)
  print(paste("Loading:", datasetNumber, dataset, DatasetPath, sep = " "))
  
  ### Create sparse matrix
  expression_matrix.mat <- Read10X(data.dir = DatasetPath, strip.suffix = T)
  colnames(expression_matrix.mat) <- paste(dataset, colnames(expression_matrix.mat), sep = "_") ### Add datasetID to cell-barcode
  dim(expression_matrix.mat)
  
  ### Create Seurat object
  seurat.object.u <- CreateSeuratObject(counts = expression_matrix.mat,
                                        min.cells = DefaultParameters$MinCells, 
                                        min.features = DefaultParameters$MinGenes, 
                                        project = paste0(PrefixOutfiles, "_", dataset)
                                        )
  seurat.object.u[['dataset.label']] <- dataset
  SeuratObjectsUnfiltered[[datasetNumber]]  <- seurat.object.u
}

## [1] "Loading: 1 G1003_A_T /Users/jdiazmej/CourseData/CAN_data/Module7/MTX_FILES/Richards_NatCancer_2021/G1003_A_T/"
## [1] "Loading: 2 G620_T /Users/jdiazmej/CourseData/CAN_data/Module7/MTX_FILES/Richards_NatCancer_2021/G620_T/"
## [1] "Loading: 3 G910_A_T /Users/jdiazmej/CourseData/CAN_data/Module7/MTX_FILES/Richards_NatCancer_2021/G910_A_T/"
## [1] "Loading: 4 G945_I_T /Users/jdiazmej/CourseData/CAN_data/Module7/MTX_FILES/Richards_NatCancer_2021/G945_I_T/"
## [1] "Loading: 5 G946_I_T /Users/jdiazmej/CourseData/CAN_data/Module7/MTX_FILES/Richards_NatCancer_2021/G946_I_T/"
## [1] "Loading: 6 G967_A_T /Users/jdiazmej/CourseData/CAN_data/Module7/MTX_FILES/Richards_NatCancer_2021/G967_A_T/"
## [1] "Loading: 7 G983_A_T /Users/jdiazmej/CourseData/CAN_data/Module7/MTX_FILES/Richards_NatCancer_2021/G983_A_T/"

print(SeuratObjectsUnfiltered)

## [[1]]
## An object of class Seurat 
## 16895 features across 3905 samples within 1 assay 
## Active assay: RNA (16895 features, 0 variable features)
## 
## [[2]]
## An object of class Seurat 
## 16016 features across 1012 samples within 1 assay 
## Active assay: RNA (16016 features, 0 variable features)
## 
## [[3]]
## An object of class Seurat 
## 16037 features across 2897 samples within 1 assay 
## Active assay: RNA (16037 features, 0 variable features)
## 
## [[4]]
## An object of class Seurat 
## 15783 features across 1279 samples within 1 assay 
## Active assay: RNA (15783 features, 0 variable features)
## 
## [[5]]
## An object of class Seurat 
## 15193 features across 1510 samples within 1 assay 
## Active assay: RNA (15193 features, 0 variable features)
## 
## [[6]]
## An object of class Seurat 
## 15994 features across 3056 samples within 1 assay 
## Active assay: RNA (15994 features, 0 variable features)
## 
## [[7]]
## An object of class Seurat 
## 15964 features across 1624 samples within 1 assay 
## Active assay: RNA (15964 features, 0 variable features)

1.0.3 QC DATASETS

####################################
###Filter cells based on number of genes, number of reads, and mitochondrial representation
####################################
SeuratObjectsFiltered <-list()

for (datasetNumber in c(1:length(DatasetIds))) {
  dataset <- DatasetIds[[as.character(datasetNumber)]]
  DatasetPath <- PathToDatasets[[dataset]]
  seurat.object.u <- SeuratObjectsUnfiltered[[datasetNumber]]
  print(paste("Filtering:", datasetNumber, dataset, DatasetPath, sep = " "))
  
  mitoRegExpressions <- paste(c("^MT-"), collapse = "|")
  mito.features <- grep(pattern = mitoRegExpressions, 
                        ignore.case = T, x = rownames(x = SeuratObjectsUnfiltered[[datasetNumber]]), value = T)

  ####################################
  ### Get mitochondrial genes
  ####################################
  if (length(mito.features)[[1]] > 0) {
    mito.fraction <- Matrix::colSums(x = GetAssayData(object = seurat.object.u, slot = 'counts')[mito.features, ]) / Matrix::colSums(x = GetAssayData(object = seurat.object.u, slot = 'counts'))
    seurat.object.u[['mito.fraction']] <- mito.fraction
  }else{
    mito.fraction <- 0 / Matrix::colSums(x = GetAssayData(object = seurat.object.u, slot = 'counts'))
    seurat.object.u[['mito.fraction']] <- mito.fraction
  }
  
  ####################################
  ### Generate violin plots
  ####################################
  VlnPlotPdf<-paste0(PathForOutfiles, "/", PrefixOutfiles, "_QC_ViolinPlots_", dataset, ".pdf")
  pdf(file=VlnPlotPdf, width = 10, height = 5)
  print(VlnPlot(seurat.object.u, features = c("nFeature_RNA", "nCount_RNA", "mito.fraction"), ncol = 3))
  dev.off()
  print(VlnPlot(seurat.object.u, features = c("nFeature_RNA", "nCount_RNA", "mito.fraction"), ncol = 3))

  ####################################
  ### Filter cells based gene counts, number of reads, ribosomal and mitochondrial representation
  ####################################
  if (length(mito.features)[[1]] > 0) {
    seurat.object.f<-subset(x = seurat.object.u, subset = 
                              nFeature_RNA  >= DefaultParameters$MinGenes
                            & nCount_RNA    <= DefaultParameters$MaxReads
                            & mito.fraction <= DefaultParameters$MaxMitoFraction)
    
  }else{
    seurat.object.f<-subset(x = seurat.object.u, subset = 
                              nFeature_RNA  >= DefaultParameters$MinGenes
                            & nCount_RNA    <= DefaultParameters$MaxReads)
  }
  
  SeuratObjectsFiltered[[datasetNumber]]  <- seurat.object.f

}

## [1] "Filtering: 1 G1003_A_T ~/CourseData/CAN_data/Module7/MTX_FILES/Richards_NatCancer_2021/G1003_A_T/"

## [1] "Filtering: 2 G620_T ~/CourseData/CAN_data/Module7/MTX_FILES/Richards_NatCancer_2021/G620_T/"

## [1] "Filtering: 3 G910_A_T ~/CourseData/CAN_data/Module7/MTX_FILES/Richards_NatCancer_2021/G910_A_T/"

## [1] "Filtering: 4 G945_I_T ~/CourseData/CAN_data/Module7/MTX_FILES/Richards_NatCancer_2021/G945_I_T/"

## [1] "Filtering: 5 G946_I_T ~/CourseData/CAN_data/Module7/MTX_FILES/Richards_NatCancer_2021/G946_I_T/"

## [1] "Filtering: 6 G967_A_T ~/CourseData/CAN_data/Module7/MTX_FILES/Richards_NatCancer_2021/G967_A_T/"

## [1] "Filtering: 7 G983_A_T ~/CourseData/CAN_data/Module7/MTX_FILES/Richards_NatCancer_2021/G983_A_T/"

####################################
### Merge Seurat objects RNA assay
####################################
FirstSeuratObject   <- SeuratObjectsFiltered[[1]]
RestOfSeuratObjectsFiltered <- SeuratObjectsFiltered[c(2:length(DatasetIds))]
RestOfDatasetsIds    <- unlist(DatasetIds[c(2:length(DatasetIds))])

seurat.object.merged <- merge(FirstSeuratObject, y = RestOfSeuratObjectsFiltered, project = PrefixOutfiles)
seurat.object.merged <- AddMetaData(object = seurat.object.merged, metadata = seurat.object.merged@meta.data$dataset.label, col.name = "dataset")
seurat.object.list <- SplitObject(seurat.object.merged, split.by = "dataset.label")

print(seurat.object.list)

## $G1003_A_T
## An object of class Seurat 
## 17974 features across 3689 samples within 1 assay 
## Active assay: RNA (17974 features, 0 variable features)
## 
## $G620_T
## An object of class Seurat 
## 17974 features across 964 samples within 1 assay 
## Active assay: RNA (17974 features, 0 variable features)
## 
## $G910_A_T
## An object of class Seurat 
## 17974 features across 2815 samples within 1 assay 
## Active assay: RNA (17974 features, 0 variable features)
## 
## $G945_I_T
## An object of class Seurat 
## 17974 features across 662 samples within 1 assay 
## Active assay: RNA (17974 features, 0 variable features)
## 
## $G946_I_T
## An object of class Seurat 
## 17974 features across 1447 samples within 1 assay 
## Active assay: RNA (17974 features, 0 variable features)
## 
## $G967_A_T
## An object of class Seurat 
## 17974 features across 3019 samples within 1 assay 
## Active assay: RNA (17974 features, 0 variable features)
## 
## $G983_A_T
## An object of class Seurat 
## 17974 features across 1536 samples within 1 assay 
## Active assay: RNA (17974 features, 0 variable features)

1.0.4 NORMALIZE DATASETS

####################################
### Running SCTransform
####################################
for (i in 1:length(seurat.object.list)) {
  dataset <- names(seurat.object.list)[[i]]
  print(paste("Normalizing:", i, dataset, sep = " "))
  seurat.object.list[[i]] <- SCTransform(seurat.object.list[[i]], verbose = F)
}

## [1] "Normalizing: 1 G1003_A_T"
## [1] "Normalizing: 2 G620_T"
## [1] "Normalizing: 3 G910_A_T"
## [1] "Normalizing: 4 G945_I_T"
## [1] "Normalizing: 5 G946_I_T"
## [1] "Normalizing: 6 G967_A_T"
## [1] "Normalizing: 7 G983_A_T"

1.0.5 SAVE EACH DATASET R_OBJECT

####################################
### Saving each dataset R object
####################################
for (i in 1:length(seurat.object.list)) {
  dataset <- names(seurat.object.list)[[i]]
  print(paste("Saving:", i, dataset, sep = " "))
  OutfileRDS<-paste0(PathForOutfiles, "/", PrefixOutfiles, "_", dataset , "_QC_Normalization.rds")
  saveRDS(seurat.object.list[[i]], file = OutfileRDS)
}

## [1] "Saving: 1 G1003_A_T"
## [1] "Saving: 2 G620_T"
## [1] "Saving: 3 G910_A_T"
## [1] "Saving: 4 G945_I_T"
## [1] "Saving: 5 G946_I_T"
## [1] "Saving: 6 G967_A_T"
## [1] "Saving: 7 G983_A_T"

1.0.6 OBTAIN COMPUTING TIME

####################################
### Obtain computing time used
####################################
StopWatchEnd$Overall  <- Sys.time()

OutfileCPUtimes<-paste0(PathForOutfiles, "/", PrefixOutfiles, "_QC_Normalization.txt")

Headers<-paste("Step", "Time(minutes)", sep="\t")
write.table(Headers, file = OutfileCPUtimes, row.names = F, col.names = F, sep="\t", quote = F, append = T)

lapply(names(StopWatchStart), function(stepToClock) {
  if (regexpr("POSIXct", class(StopWatchStart[[stepToClock]]), ignore.case = T)[1] == 1) {
    TimeStart <- StopWatchStart[[stepToClock]]
    TimeEnd   <- StopWatchEnd[[stepToClock]]
    TimeDiff <- format(difftime(TimeEnd, TimeStart, units = "min"))
    ReportTime<-c(paste(stepToClock, TimeDiff, sep = "\t", collapse = ""))
    write(file = OutfileCPUtimes, x=gsub(pattern = " mins", replacement = "", x = ReportTime), append = T)
  }
})

## [[1]]
## NULL

1.0.7 FINISH

####################################
### Finish
####################################
options(warn = oldw)
writeLines(paste0("\nEND - Check:\n", PathForOutfiles, "\nFor outfiles\n\n"))

## 
## END - Check:
## /Users/jdiazmej/CourseData/CAN_data/Module7/OUTFILES/Richards_NatCancer_2021
## For outfiles

# quit()

Javier Diaz - javier.diazmejia@gmail.com
Script based on: https://satijalab.org/seurat/articles/integration_introduction.html
GENERAL OVERVIEW OF THIS SCRIPT
1. Loads each normalized dataset R object produced by script CBW_CAN_2021_Module7_Lab1_QC_Normalization.R
2. Merges and integrates datasets correcting batch effects
3. Saves R object with integrated datasets

1.0.8 DEFINE ENVIRONMENT, INPUTS AND OUTDIR

oldw <- getOption("warn")
options( warn = -1 )

1.0.8.0.0.1 Required libraries

library(Seurat) # (CRAN) main scRNA-seq analysis package
library(future) # (CRAN) to run parallel processes
library(STACAS) # (GitHub carmonalab/STACAS) for STACAS-based anchor detection
#comment

1.0.8.0.0.2 User’s home and stopwatch to time run

UserHomeDirectory <- Sys.getenv("HOME")[[1]]
print(UserHomeDirectory)

## [1] "/Users/jdiazmej"

StopWatchStart <- list()
StopWatchEnd   <- list()

StopWatchStart$Overall  <- Sys.time()

1.0.8.0.0.3 Define inputs

### Run specific inputs and parameters

AnchorFinder        <- "STACAS" ## Either 'STACAS' or 'SEURAT'

### /path_to/*rds infiles (R objects) from CBW_CAN_2021_Module7_Lab1__QC_Normalization.R

### Not including G945_I_T due to memory constrains and because it has high mito.fraction

PathToDatasets <- list(
  "G1003_A_T" = "~/CourseData/CAN_data/Module7/OUTFILES/Richards_NatCancer_2021/Richards_NatCancer_2021_G1003_A_T_QC_Normalization.rds",
  # "G945_I_T"  = "~/CourseData/CAN_data/Module7/OUTFILES/Richards_NatCancer_2021/Richards_NatCancer_2021_G945_I_T_QC_Normalization.rds",
  "G910_A_T"  = "~/CourseData/CAN_data/Module7/OUTFILES/Richards_NatCancer_2021/Richards_NatCancer_2021_G910_A_T_QC_Normalization.rds",
  "G983_A_T"  = "~/CourseData/CAN_data/Module7/OUTFILES/Richards_NatCancer_2021/Richards_NatCancer_2021_G983_A_T_QC_Normalization.rds",
  "G620_T"    = "~/CourseData/CAN_data/Module7/OUTFILES/Richards_NatCancer_2021/Richards_NatCancer_2021_G620_T_QC_Normalization.rds",
  "G946_I_T"  = "~/CourseData/CAN_data/Module7/OUTFILES/Richards_NatCancer_2021/Richards_NatCancer_2021_G946_I_T_QC_Normalization.rds",
  "G967_A_T"  = "~/CourseData/CAN_data/Module7/OUTFILES/Richards_NatCancer_2021/Richards_NatCancer_2021_G967_A_T_QC_Normalization.rds"
)

1.0.8.0.0.4 Define outputs

### Outputs
PathForOutfiles     <- paste0("~/CourseData/CAN_data/Module7/OUTFILES/Richards_NatCancer_2021/", AnchorFinder, "/WITHOUT_G945_I_T") ## /path_to/out_directory
PrefixOutfiles      <- "Richards_NatCancer_2021"  ## Prefix for outfiles
PathForOutfiles     <- gsub("^~/",paste0(UserHomeDirectory,"/"), PathForOutfiles)
dir.create(path = PathForOutfiles, recursive = T, showWarnings = F)

1.0.8.0.0.5 Define default parameters

### Default parameters. Either suggested by Seurat developers, or tailored empirically.

### NOTE: when datasets being integrated have fewer anchors than needed for the integration,
### step IntegrateData() will produce an error:
### `Error in nn2(data = c(15.4534704633918, 15.4534704633918, 15.4534704633918,  : 
### Cannot find more nearest neighbours than there are points`
### To fix this using Seurat's anchor finder in step FindIntegrationAnchors() lower the k.filter value (e.g. 150, default = 200)
### To fix this using STACAS anchor finder, in step FilterAnchors.STACAS() increase the dist.pct and dist.thr values (e.g. 0.9, default = 0.8)
### More info: https://github.com/satijalab/seurat/issues/2056
DefaultParameters <- list(
  
  ### Parameters for dataset integration
  IntegrationNFeatures = 3000,
  StacasVarGenesIntegratedN = 500,
  PcaDimsUse = c(1:20),
  SeuratFindAnchorsKFilter = 200, #default 200
  StacasDistPct = 0.9, #default 0.8
  StacasDistThr = 0.9, #default 0.8
  
  ### ReferenceDatasets = either a <comma> delimited list of dataset ID(s) to be used as
  ### references for anchors or 'NA' to compare all-vs-all
  ReferenceDatasets = "NA"
)

1.0.8.0.0.6 Report R sessionInfo

OutfileRSessionInfo<-paste0(PathForOutfiles, "/", PrefixOutfiles, "_SingleCell_2_Integration_RSessionInfo.txt")
writeLines(capture.output(sessionInfo()), OutfileRSessionInfo)
capture.output(sessionInfo())

##  [1] "R version 4.0.2 (2020-06-22)"                                                                                           
##  [2] "Platform: x86_64-apple-darwin17.0 (64-bit)"                                                                             
##  [3] "Running under: macOS High Sierra 10.13.6"                                                                               
##  [4] ""                                                                                                                       
##  [5] "Matrix products: default"                                                                                               
##  [6] "BLAS:   /System/Library/Frameworks/Accelerate.framework/Versions/A/Frameworks/vecLib.framework/Versions/A/libBLAS.dylib"
##  [7] "LAPACK: /Library/Frameworks/R.framework/Versions/4.0/Resources/lib/libRlapack.dylib"                                    
##  [8] ""                                                                                                                       
##  [9] "locale:"                                                                                                                
## [10] "[1] en_CA.UTF-8/en_CA.UTF-8/en_CA.UTF-8/C/en_CA.UTF-8/en_CA.UTF-8"                                                      
## [11] ""                                                                                                                       
## [12] "attached base packages:"                                                                                                
## [13] "[1] stats     graphics  grDevices utils     datasets  methods   base     "                                              
## [14] ""                                                                                                                       
## [15] "other attached packages:"                                                                                               
## [16] "[1] STACAS_1.1.0       future_1.21.0      SeuratObject_4.0.1 Seurat_4.0.2      "                                        
## [17] ""                                                                                                                       
## [18] "loaded via a namespace (and not attached):"                                                                             
## [19] "  [1] Rtsne_0.15            colorspace_2.0-1      deldir_0.2-10         ellipsis_0.3.2       "                          
## [20] "  [5] ggridges_0.5.3        rstudioapi_0.13       spatstat.data_2.1-0   farver_2.1.0         "                          
## [21] "  [9] leiden_0.3.8          listenv_0.8.0         ggrepel_0.9.1         fansi_0.5.0          "                          
## [22] " [13] codetools_0.2-18      splines_4.0.2         knitr_1.33            polyclip_1.10-0      "                          
## [23] " [17] jsonlite_1.7.2        packrat_0.6.0         ica_1.0-2             cluster_2.1.2        "                          
## [24] " [21] png_0.1-7             uwot_0.1.10           shiny_1.6.0           sctransform_0.3.2    "                          
## [25] " [25] spatstat.sparse_2.0-0 compiler_4.0.2        httr_1.4.2            assertthat_0.2.1     "                          
## [26] " [29] Matrix_1.3-3          fastmap_1.1.0         lazyeval_0.2.2        later_1.2.0          "                          
## [27] " [33] htmltools_0.5.1.1     tools_4.0.2           igraph_1.2.6          gtable_0.3.0         "                          
## [28] " [37] glue_1.4.2            RANN_2.6.1            reshape2_1.4.4        dplyr_1.0.6          "                          
## [29] " [41] Rcpp_1.0.6            scattermore_0.7       jquerylib_0.1.4       vctrs_0.3.8          "                          
## [30] " [45] nlme_3.1-152          lmtest_0.9-38         xfun_0.23             stringr_1.4.0        "                          
## [31] " [49] globals_0.14.0        mime_0.10             miniUI_0.1.1.1        lifecycle_1.0.0      "                          
## [32] " [53] irlba_2.3.3           goftest_1.2-2         MASS_7.3-54           zoo_1.8-9            "                          
## [33] " [57] scales_1.1.1          spatstat.core_2.1-2   promises_1.2.0.1      spatstat.utils_2.1-0 "                          
## [34] " [61] parallel_4.0.2        RColorBrewer_1.1-2    yaml_2.2.1            reticulate_1.20      "                          
## [35] " [65] pbapply_1.4-3         gridExtra_2.3         ggplot2_3.3.3         sass_0.4.0           "                          
## [36] " [69] rpart_4.1-15          stringi_1.6.2         highr_0.9             rlang_0.4.11         "                          
## [37] " [73] pkgconfig_2.0.3       matrixStats_0.59.0    evaluate_0.14         lattice_0.20-44      "                          
## [38] " [77] ROCR_1.0-11           purrr_0.3.4           tensor_1.5            labeling_0.4.2       "                          
## [39] " [81] patchwork_1.1.1       htmlwidgets_1.5.3     cowplot_1.1.1         tidyselect_1.1.1     "                          
## [40] " [85] parallelly_1.25.0     RcppAnnoy_0.0.18      plyr_1.8.6            magrittr_2.0.1       "                          
## [41] " [89] bookdown_0.22         R6_2.5.0              generics_0.1.0        DBI_1.1.1            "                          
## [42] " [93] withr_2.4.2           mgcv_1.8-35           pillar_1.6.1          fitdistrplus_1.1-5   "                          
## [43] " [97] survival_3.2-11       abind_1.4-5           tibble_3.1.2          future.apply_1.7.0   "                          
## [44] "[101] crayon_1.4.1          KernSmooth_2.23-20    utf8_1.2.1            spatstat.geom_2.1-0  "                          
## [45] "[105] plotly_4.9.3          rmarkdown_2.8         grid_4.0.2            data.table_1.14.0    "                          
## [46] "[109] digest_0.6.27         xtable_1.8-4          tidyr_1.1.3           httpuv_1.6.1         "                          
## [47] "[113] munsell_0.5.0         viridisLite_0.4.0     bslib_0.2.5.1        "

1.0.8.0.0.7 Define number of cores and RAM for parallelization

NumbCores <- 1
MaxGlobalVariables <- 10000

### Using plan(strategy = "multisession", workers = NumbCoresToUse)
### failed in the CBW AWS instance, with error:
###   Failed to retrieve the result of MulticoreFuture (future_lapply-1) from the forked worker (on localhost; PID 194469).
###   Post-mortem diagnostic: No process exists with this PID, i.e. the forked localhost worker is no longer alive.
### This seems to be related to not having enough RAM in the AWS instance allocated for the course
### To fix this for the CBW CAN, we are:
###   a) removing from the analysis a dataset (G945_I_T) with high mito.fraction
###   b) Using `plan(strategy = "sequential")` in script CBW_CAN_SingleCell_2_Integration.R (i.e. no parallelization)
### 
### Notes: `options(future.globals.maxSize = MaxGlobalVariables)` sets the maximum allowed size in bytes.
###         To set it to 10GB, you would run options(future.globals.maxSize = 10000 * 1024^2).
###         Increasing the value of future.globals.maxSize will increase your RAM usage so set this number mindfully.
###         https://satijalab.org/seurat/archive/v3.1/future_vignette.html
###         https://github.com/satijalab/seurat/issues/3249

if (regexpr("^MAX$", NumbCores, ignore.case = T)[1] == 1) {
  NumbCoresToUse <- availableCores()[[1]]
}else if (regexpr("^[0-9]+$", NumbCores, ignore.case = T)[1] == 1) {
  NumbCoresToUse <- as.numeric(NumbCores)
}else{
  stop(paste0("Unexpected format for NumbCores: ", NumbCores, "\n"))
}

if (NumbCoresToUse == 1) {
  plan(strategy = "sequential")
  writeLines(paste0("\n", "*** Running: in 'sequential' mode with ", NumbCoresToUse, " core ***", "\n"))
}else if (NumbCoresToUse > 1) {
  # plan(strategy = "multicore", workers = NumbCoresToUse)
  plan(strategy = "multisession", workers = NumbCoresToUse)
  writeLines(paste0("\n", "*** Running: in 'multicore' mode with ", NumbCoresToUse, " cores ***", "\n"))
}else{
  stop(paste0("Unexpected NumbCores = ", NumbCoresToUse))
}

## 
## *** Running: in 'sequential' mode with 1 core ***

options(future.globals.maxSize = MaxGlobalVariables * 1024^2)

1.0.9 LOAD R OBJECTS

####################################
### Load R Objects
####################################
InputsTable <- data.frame(matrix(unlist(PathToDatasets), nrow=length(PathToDatasets), byrow=TRUE))
rownames(InputsTable) <- names(PathToDatasets)
colnames(InputsTable)<-c("PathToRObject")
NumberOfDatasets <- nrow(InputsTable)

seurat.object.list <- list()
for (dataset in rownames(InputsTable)) {
  print(dataset)
  DatasetIndexInInputsTable <- which(x = rownames(InputsTable) == dataset)
  InputRobject <- InputsTable[dataset,"PathToRObject"]
  seurat.object.list[[DatasetIndexInInputsTable]] <- readRDS(InputRobject)
}

## [1] "G1003_A_T"
## [1] "G910_A_T"
## [1] "G983_A_T"
## [1] "G620_T"
## [1] "G946_I_T"
## [1] "G967_A_T"

1.0.10 INTEGRATE DATASETS

####################################
### Get reference datasets
####################################
if (regexpr("^NA$", DefaultParameters$ReferenceDatasets , ignore.case = T)[1] == 1) {
  ReferenceDatasets.indices <- c(1:nrow(InputsTable))
  print("Will run all pairwise dataset comparisons")
}else{
  ReferenceDatasets.list <- unlist(strsplit(DefaultParameters$ReferenceDatasets, ","))
  NumberOfFoundReferenceDatasetIDs <- sum(ReferenceDatasets.list %in% rownames(InputsTable) == T)
  if (NumberOfFoundReferenceDatasetIDs == length(ReferenceDatasets.list)) {
    ReferenceDatasets.indices <- match(ReferenceDatasets.list, rownames(InputsTable))
  }else{
    stop(paste0("Requested ", length(ReferenceDatasets.list), " datasets as references, but found ", NumberOfFoundReferenceDatasetIDs))
  }
  print(paste0("Will use datasets: ", paste(as.character(ReferenceDatasets.indices), sep = "", collapse = ",")))
}

## [1] "Will run all pairwise dataset comparisons"

####################################
### Get integration anchors
####################################
if (regexpr("^STACAS$", AnchorFinder, ignore.case = T)[1] == 1) {
  
  ####################################
  ### Get anchors with STACAS
  ####################################
  print("Get integration anchors with STACAS")
  seurat.object.anchors.unfiltered <- FindAnchors.STACAS(object.list = seurat.object.list, dims=DefaultParameters$PcaDimsUse,
                                                         anchor.features=DefaultParameters$StacasVarGenesIntegratedN, reference = ReferenceDatasets.indices, verbose = F)
  seurat.object.anchors <- FilterAnchors.STACAS(seurat.object.anchors.unfiltered, dist.thr = DefaultParameters$StacasDistPct, DefaultParameters$StacasDistThr)
  SampleTree <- SampleTree.STACAS(seurat.object.anchors)

} else if (regexpr("^Seurat$", AnchorFinder , ignore.case = T)[1] == 1) {
  
  ####################################
  ### Get anchors with Seurat
  ####################################
  print("Get integration anchors with SEURAT")
  seurat.object.integratedfeatures <- SelectIntegrationFeatures(object.list = seurat.object.list, nfeatures = DefaultParameters$IntegrationNFeatures, verbose = F)
  seurat.object.list <- PrepSCTIntegration(object.list = seurat.object.list, anchor.features = seurat.object.integratedfeatures, verbose = F)
  seurat.object.anchors <- FindIntegrationAnchors(object.list = seurat.object.list, k.filter = DefaultParameters$SeuratFindAnchorsKFilter, normalization.method = "SCT",
                                                  dims=DefaultParameters$PcaDimsUse, anchor.features = seurat.object.integratedfeatures, reference = ReferenceDatasets.indices,
                                                  verbose = F)
  SampleTree <- NULL
  
} else {
  stop("ERROR: unexpected anchors function")
}

## [1] "Get integration anchors with STACAS"

## Preparing PCA embeddings for objects...

##  1/6 2/6 3/6 4/6 5/6 6/6

## Filter anchors using distance threshold t=0.900

print(seurat.object.anchors)

## An AnchorSet object containing 15946 anchors between 6 Seurat objects 
##  This can be used as input to IntegrateData.

1.0.11 INTEGRATE DATASETS

####################################
### Integrate datasets
####################################
seurat.object.integrated <- IntegrateData(anchorset = seurat.object.anchors, normalization.method = "SCT", sample.tree = SampleTree, preserve.order = T, verbose = T)

## Merging dataset 4 into 1

## Extracting anchors for merged samples

## Finding integration vectors

## Finding integration vector weights

## Integrating data

## Merging dataset 2 into 5

## Extracting anchors for merged samples

## Finding integration vectors

## Finding integration vector weights

## Integrating data

## Merging dataset 3 into 1 4

## Extracting anchors for merged samples

## Finding integration vectors

## Finding integration vector weights

## Integrating data

## Merging dataset 5 2 into 1 4 3

## Extracting anchors for merged samples

## Finding integration vectors

## Finding integration vector weights

## Integrating data

## Merging dataset 6 into 1 4 3 5 2

## Extracting anchors for merged samples

## Finding integration vectors

## Finding integration vector weights

## Integrating data

print(seurat.object.integrated)

## An object of class Seurat 
## 34523 features across 13470 samples within 3 assays 
## Active assay: integrated (500 features, 500 variable features)
##  2 other assays present: RNA, SCT

1.0.12 SAVE INTEGRATED R_OBJECT

####################################
### Saving integrated R object
####################################
OutfileRDS<-paste0(PathForOutfiles, "/", PrefixOutfiles, "_Integration.rds")
saveRDS(seurat.object.integrated, file = OutfileRDS)

1.0.13 OBTAIN COMPUTING TIME

####################################
### Obtain computing time used
####################################
StopWatchEnd$Overall  <- Sys.time()

OutfileCPUtimes<-paste0(PathForOutfiles, "/", PrefixOutfiles, "_Integration_CPUtimes.txt")

write(file = OutfileCPUtimes, x = paste("#Number_of_cores_used", NumbCoresToUse, sep = "\t", collapse = ""))
write(file = OutfileCPUtimes, x = paste("#MaxGlobalVariables", MaxGlobalVariables, sep = "\t", collapse = ""), append = T)

Headers<-paste("Step", "Time(minutes)", sep="\t")
write.table(Headers, file = OutfileCPUtimes, row.names = F, col.names = F, sep="\t", quote = F, append = T)

lapply(names(StopWatchStart), function(stepToClock) {
  if (regexpr("POSIXct", class(StopWatchStart[[stepToClock]]), ignore.case = T)[1] == 1) {
    TimeStart <- StopWatchStart[[stepToClock]]
    TimeEnd   <- StopWatchEnd[[stepToClock]]
    TimeDiff <- format(difftime(TimeEnd, TimeStart, units = "min"))
    ReportTime<-c(paste(stepToClock, TimeDiff, sep = "\t", collapse = ""))
    write(file = OutfileCPUtimes, x=gsub(pattern = " mins", replacement = "", x = ReportTime), append = T)
  }
})

## [[1]]
## NULL

1.0.14 FINISH

####################################
### Finish
####################################
options(warn = oldw)
writeLines(paste0("\nEND - Check:\n", PathForOutfiles, "\nFor outfiles\n\n"))

## 
## END - Check:
## /Users/jdiazmej/CourseData/CAN_data/Module7/OUTFILES/Richards_NatCancer_2021/STACAS/WITHOUT_G945_I_T
## For outfiles

# quit()

Javier Diaz - javier.diazmejia@gmail.com
Script based on: https://satijalab.org/seurat/articles/integration_introduction.html
GENERAL OVERVIEW OF THIS SCRIPT
1. Loads integrated datasets R object produced by script CBW_CAN_2021_Module7_Lab2_Integration.R
2. Process integrated datasets as a whole, including:
- dimension reduction
- ‘global’ cell clustering
- UMAP plots by global cell clusters, sample, requested genes and metadata
- Violin plots
- Dot plots
1. Saves R object with PCA, clustering and dimension reduction

1.0.15 DEFINE ENVIRONMENT, INPUTS AND OUTDIR

oldw <- getOption("warn")
options( warn = -1 )

1.0.15.0.0.1 Required libraries

library(Seurat)  # (CRAN) main scRNA-seq analysis package
library(future)  # (CRAN) to run parallel processes
library(ggplot2) # (CRAN) to generate enhanced plots
library(DropletUtils) # (Bioconductor) to write out MTX format files

## Loading required package: SingleCellExperiment

## Loading required package: SummarizedExperiment

## Loading required package: GenomicRanges

## Loading required package: stats4

## Loading required package: BiocGenerics

## Loading required package: parallel

## 
## Attaching package: 'BiocGenerics'

## The following objects are masked from 'package:parallel':
## 
##     clusterApply, clusterApplyLB, clusterCall, clusterEvalQ, clusterExport, clusterMap,
##     parApply, parCapply, parLapply, parLapplyLB, parRapply, parSapply, parSapplyLB

## The following objects are masked from 'package:stats':
## 
##     IQR, mad, sd, var, xtabs

## The following objects are masked from 'package:base':
## 
##     anyDuplicated, append, as.data.frame, basename, cbind, colnames, dirname, do.call,
##     duplicated, eval, evalq, Filter, Find, get, grep, grepl, intersect, is.unsorted,
##     lapply, Map, mapply, match, mget, order, paste, pmax, pmax.int, pmin, pmin.int,
##     Position, rank, rbind, Reduce, rownames, sapply, setdiff, sort, table, tapply,
##     union, unique, unsplit, which, which.max, which.min

## Loading required package: S4Vectors

## 
## Attaching package: 'S4Vectors'

## The following object is masked from 'package:future':
## 
##     values

## The following object is masked from 'package:base':
## 
##     expand.grid

## Loading required package: IRanges

## Loading required package: GenomeInfoDb

## Loading required package: Biobase

## Welcome to Bioconductor
## 
##     Vignettes contain introductory material; view with 'browseVignettes()'. To cite
##     Bioconductor, see 'citation("Biobase")', and for packages 'citation("pkgname")'.

## Loading required package: DelayedArray

## Loading required package: matrixStats

## 
## Attaching package: 'matrixStats'

## The following objects are masked from 'package:Biobase':
## 
##     anyMissing, rowMedians

## 
## Attaching package: 'DelayedArray'

## The following objects are masked from 'package:matrixStats':
## 
##     colMaxs, colMins, colRanges, rowMaxs, rowMins, rowRanges

## The following objects are masked from 'package:base':
## 
##     aperm, apply, rowsum

## 
## Attaching package: 'SummarizedExperiment'

## The following object is masked from 'package:SeuratObject':
## 
##     Assays

## The following object is masked from 'package:Seurat':
## 
##     Assays

1.0.15.0.0.2 User’s home and stopwatch to time run

UserHomeDirectory <- Sys.getenv("HOME")[[1]]
print(UserHomeDirectory)

## [1] "/Users/jdiazmej"

StopWatchStart <- list()
StopWatchEnd   <- list()

StopWatchStart$Overall  <- Sys.time()

1.0.15.0.0.3 Define inputs

### Run specific inputs and parameters

AnchorFinder        <- "STACAS" ## Either 'STACAS' or 'SEURAT'
PathToIntegratedRds <- paste0("~/CourseData/CAN_data/Module7/OUTFILES/Richards_NatCancer_2021/", AnchorFinder, "/WITHOUT_G945_I_T", "/Richards_NatCancer_2021_Integration.rds") ### /path_to/*rds (R object) from CBW_CAN_2021_Module7_Lab2_Integration.R
InfileMetadata      <- "~/CourseData/CAN_data/Module7/METADATA/Richards_NatCancer_2021_without_G945_I_T.metadata.tsv"
MetedataPropsToPlot <- c("cell_cluster.cell_type", "cell_type")
PathToIntegratedRds <- gsub("^~/",paste0(UserHomeDirectory,"/"), PathToIntegratedRds)
SelectedGenes       <- c("PTPRC", "MOG", "MAG", "EGFR", "CD3D", "CD2", "ITGAM", "FCGR3A", "CD14", "TMEM119")
ASSAY               <- "SCT"
NumberOfDimensions  <- 20
ClustersForDotPlot2 <- c(0,1,4,8,10)
DimRedMethodPlots   <- "umap"
SampleForInferCNV   <- "G983_A_T" ## Filtered counts and cell cluster identities from this sample will be used for CBW_CAN_SingleCell_Lab6_InferCNV.R

1.0.15.0.0.4 Define outputs

### Outputs
PathForOutfiles     <- paste0("~/CourseData/CAN_data/Module7/OUTFILES/Richards_NatCancer_2021/", AnchorFinder, "/WITHOUT_G945_I_T") ## /path_to/out_directory
PrefixOutfiles      <- "Richards_NatCancer_2021"  ## Prefix for outfiles
PathForOutfiles     <- gsub("^~/",paste0(UserHomeDirectory,"/"), PathForOutfiles)
dir.create(path = PathForOutfiles, recursive = T, showWarnings = F)

1.0.15.0.0.5 Define default parameters

### Default parameters. Either suggested by Seurat developers, or tailored empirically.
DefaultParameters <- list(
  
  ### Parameters for QC plots
  CellPropertiesToQC = c("nFeature_RNA", "nCount_RNA", "mito.fraction"),
  
  ### Parameters for clustering
  Resolution = 0.5,

  ### Parameters for datasets comparison
  AssaysForAverageGETables = c("RNA", "SCT"),
  
  ### Parameters for t-SNE plots
  MinNumberOfCellsToReducePerplexity = 150,
  ReducedPerplexity = 7
)

### Dimension reduction methods
DimensionReductionMethods<-list()
DimensionReductionMethods$umap$name <-"UMAP"
DimensionReductionMethods$tsne$name <-"TSNE"
DimensionReductionMethods$umap$run  <-as.function(RunUMAP)
DimensionReductionMethods$tsne$run  <-as.function(RunTSNE)

1.0.15.0.0.6 Report R sessionInfo

OutfileRSessionInfo<-paste0(PathForOutfiles, "/", PrefixOutfiles, "_SingleCell_3_PCA_Clustering_DimReduction_RSessionInfo.txt")
writeLines(capture.output(sessionInfo()), OutfileRSessionInfo)
capture.output(sessionInfo())

##  [1] "R version 4.0.2 (2020-06-22)"                                                                                           
##  [2] "Platform: x86_64-apple-darwin17.0 (64-bit)"                                                                             
##  [3] "Running under: macOS High Sierra 10.13.6"                                                                               
##  [4] ""                                                                                                                       
##  [5] "Matrix products: default"                                                                                               
##  [6] "BLAS:   /System/Library/Frameworks/Accelerate.framework/Versions/A/Frameworks/vecLib.framework/Versions/A/libBLAS.dylib"
##  [7] "LAPACK: /Library/Frameworks/R.framework/Versions/4.0/Resources/lib/libRlapack.dylib"                                    
##  [8] ""                                                                                                                       
##  [9] "locale:"                                                                                                                
## [10] "[1] en_CA.UTF-8/en_CA.UTF-8/en_CA.UTF-8/C/en_CA.UTF-8/en_CA.UTF-8"                                                      
## [11] ""                                                                                                                       
## [12] "attached base packages:"                                                                                                
## [13] "[1] parallel  stats4    stats     graphics  grDevices utils     datasets  methods   base     "                          
## [14] ""                                                                                                                       
## [15] "other attached packages:"                                                                                               
## [16] " [1] DropletUtils_1.8.0          SingleCellExperiment_1.10.1 SummarizedExperiment_1.18.2"                               
## [17] " [4] DelayedArray_0.14.1         matrixStats_0.59.0          Biobase_2.48.0             "                               
## [18] " [7] GenomicRanges_1.40.0        GenomeInfoDb_1.24.2         IRanges_2.22.2             "                               
## [19] "[10] S4Vectors_0.26.1            BiocGenerics_0.34.0         ggplot2_3.3.3              "                               
## [20] "[13] STACAS_1.1.0                future_1.21.0               SeuratObject_4.0.1         "                               
## [21] "[16] Seurat_4.0.2               "                                                                                       
## [22] ""                                                                                                                       
## [23] "loaded via a namespace (and not attached):"                                                                             
## [24] "  [1] plyr_1.8.6             igraph_1.2.6           lazyeval_0.2.2         splines_4.0.2         "                      
## [25] "  [5] BiocParallel_1.22.0    listenv_0.8.0          scattermore_0.7        digest_0.6.27         "                      
## [26] "  [9] htmltools_0.5.1.1      fansi_0.5.0            magrittr_2.0.1         tensor_1.5            "                      
## [27] " [13] cluster_2.1.2          ROCR_1.0-11            limma_3.44.3           globals_0.14.0        "                      
## [28] " [17] R.utils_2.10.1         spatstat.sparse_2.0-0  colorspace_2.0-1       ggrepel_0.9.1         "                      
## [29] " [21] xfun_0.23              dplyr_1.0.6            crayon_1.4.1           RCurl_1.98-1.3        "                      
## [30] " [25] jsonlite_1.7.2         spatstat.data_2.1-0    survival_3.2-11        zoo_1.8-9             "                      
## [31] " [29] glue_1.4.2             polyclip_1.10-0        gtable_0.3.0           zlibbioc_1.34.0       "                      
## [32] " [33] XVector_0.28.0         leiden_0.3.8           Rhdf5lib_1.10.1        future.apply_1.7.0    "                      
## [33] " [37] HDF5Array_1.16.1       abind_1.4-5            scales_1.1.1           edgeR_3.30.3          "                      
## [34] " [41] DBI_1.1.1              miniUI_0.1.1.1         Rcpp_1.0.6             viridisLite_0.4.0     "                      
## [35] " [45] xtable_1.8-4           dqrng_0.3.0            reticulate_1.20        spatstat.core_2.1-2   "                      
## [36] " [49] htmlwidgets_1.5.3      httr_1.4.2             RColorBrewer_1.1-2     ellipsis_0.3.2        "                      
## [37] " [53] ica_1.0-2              R.methodsS3_1.8.1      pkgconfig_2.0.3        farver_2.1.0          "                      
## [38] " [57] sass_0.4.0             uwot_0.1.10            deldir_0.2-10          locfit_1.5-9.4        "                      
## [39] " [61] utf8_1.2.1             tidyselect_1.1.1       labeling_0.4.2         rlang_0.4.11          "                      
## [40] " [65] reshape2_1.4.4         later_1.2.0            munsell_0.5.0          tools_4.0.2           "                      
## [41] " [69] generics_0.1.0         ggridges_0.5.3         evaluate_0.14          stringr_1.4.0         "                      
## [42] " [73] fastmap_1.1.0          yaml_2.2.1             goftest_1.2-2          knitr_1.33            "                      
## [43] " [77] fitdistrplus_1.1-5     purrr_0.3.4            RANN_2.6.1             packrat_0.6.0         "                      
## [44] " [81] pbapply_1.4-3          nlme_3.1-152           mime_0.10              R.oo_1.24.0           "                      
## [45] " [85] compiler_4.0.2         rstudioapi_0.13        plotly_4.9.3           png_0.1-7             "                      
## [46] " [89] spatstat.utils_2.1-0   tibble_3.1.2           bslib_0.2.5.1          stringi_1.6.2         "                      
## [47] " [93] highr_0.9              lattice_0.20-44        Matrix_1.3-3           vctrs_0.3.8           "                      
## [48] " [97] pillar_1.6.1           lifecycle_1.0.0        spatstat.geom_2.1-0    lmtest_0.9-38         "                      
## [49] "[101] jquerylib_0.1.4        RcppAnnoy_0.0.18       data.table_1.14.0      cowplot_1.1.1         "                      
## [50] "[105] bitops_1.0-7           irlba_2.3.3            httpuv_1.6.1           patchwork_1.1.1       "                      
## [51] "[109] R6_2.5.0               bookdown_0.22          promises_1.2.0.1       KernSmooth_2.23-20    "                      
## [52] "[113] gridExtra_2.3          parallelly_1.25.0      codetools_0.2-18       MASS_7.3-54           "                      
## [53] "[117] assertthat_0.2.1       rhdf5_2.32.4           withr_2.4.2            sctransform_0.3.2     "                      
## [54] "[121] GenomeInfoDbData_1.2.3 mgcv_1.8-35            grid_4.0.2             rpart_4.1-15          "                      
## [55] "[125] tidyr_1.1.3            rmarkdown_2.8          Rtsne_0.15             shiny_1.6.0           "

1.0.15.0.0.7 Define number of cores and RAM for parallelization

NumbCores <- "MAX"
MaxGlobalVariables <- 10000

if (regexpr("^MAX$", NumbCores, ignore.case = T)[1] == 1) {
  NumbCoresToUse <- availableCores()[[1]]
}else if (regexpr("^[0-9]+$", NumbCores, ignore.case = T)[1] == 1) {
  NumbCoresToUse <- as.numeric(NumbCores)
}else{
  stop(paste0("Unexpected format for NumbCores: ", NumbCores, "\n"))
}

if (NumbCoresToUse == 1) {
  plan(strategy = "sequential")
  writeLines(paste0("\n", "*** Running: in 'sequential' mode with ", NumbCoresToUse, " core ***", "\n"))
}else if (NumbCoresToUse > 1) {
  plan(strategy = "multicore", workers = NumbCoresToUse)
  writeLines(paste0("\n", "*** Running: in 'multicore' mode with ", NumbCoresToUse, " cores ***", "\n"))
}else{
  stop(paste0("Unexpected NumbCores = ", NumbCoresToUse))
}

## 
## *** Running: in 'multicore' mode with 4 cores ***

options(future.globals.maxSize = MaxGlobalVariables * 1024^2)

1.0.16 LOAD R OBJECT

####################################
### Load R Object
####################################
seurat.object.integrated <- readRDS(PathToIntegratedRds)

1.0.17 REDUCE DIMENSIONS

####################################
### Obtaining principal components
####################################
seurat.object.integrated <- RunPCA(seurat.object.integrated, verbose = F)

####################################
### Generate Elbow plot
####################################
seurat.object.integrated@active.assay <- ASSAY

ForElbowPlot<-ElbowPlot(object = seurat.object.integrated, ndims = 50, reduction = "pca")
MaxYAxis<-as.integer(max(ForElbowPlot$data$stdev)+1)

OutfilePCElbowPlot <- paste0(PathForOutfiles, "/",  PrefixOutfiles, "_PCElbowPlot", ".pdf")
pdf(file=OutfilePCElbowPlot, width = 7, height = 7)
print(ForElbowPlot
      + scale_x_continuous(breaks =  seq(from = 0, to = 50, by=5))
      + geom_vline(xintercept = seq(from = 0, to = 50, by=5), linetype='dotted', col="red")
      + scale_y_continuous(breaks =  seq(from = 0, to = MaxYAxis, by=0.5))
      + geom_hline(yintercept = seq(from = 0, to = MaxYAxis, by=0.5), linetype='dotted', col="red")
)
dev.off()

## quartz_off_screen 
##                 2

print(ForElbowPlot
      + scale_x_continuous(breaks =  seq(from = 0, to = 50, by=5))
      + geom_vline(xintercept = seq(from = 0, to = 50, by=5), linetype='dotted', col="red")
      + scale_y_continuous(breaks =  seq(from = 0, to = MaxYAxis, by=0.5))
      + geom_hline(yintercept = seq(from = 0, to = MaxYAxis, by=0.5), linetype='dotted', col="red")
)

####################################
### Run dimension reductions using integrated data
####################################
writeLines("\n*** Run dimension reductions using integrated data ***\n")

## 
## *** Run dimension reductions using integrated data ***

PcaDimsUse <- c(1:NumberOfDimensions)

for (dim_red_method in names(DimensionReductionMethods)) {
  ####################################
  ### Run non-linear dimension reductions using integrated data
  ####################################
  ### NOTES:
  ### In RunTSNE: if the number of cells is too small, user may get error:
  ### `Error in Rtsne.default(X = as.matrix(x = data.use), dims = dim.embed,  : Perplexity is too large.`
  ### User can try tunning down the default RunTSNE(..., perplexity=30) to say 5 or 10
  ###
  ### Also using RunTSNE(..., check_duplicates = F) to skip cases where cells happen to have the same values after PCA reduction
  
  if (("tsne" %in% dim_red_method == T) & (length(colnames(seurat.object.integrated)) < DefaultParameters$MinNumberOfCellsToReducePerplexity)) {
    writeLines(paste0("\n*** Using reduced perplexity = ", DefaultParameters$ReducedPerplexity, " because found ",  length(colnames(seurat.object.integrated)), " cells", " ***\n"))
    seurat.object.integrated <- DimensionReductionMethods[[dim_red_method]][["run"]](object = seurat.object.integrated, dims = PcaDimsUse, perplexity = DefaultParameters$ReducedPerplexity, check_duplicates = F)
  }else if ("tsne" %in% dim_red_method == T) {
    seurat.object.integrated <- DimensionReductionMethods[[dim_red_method]][["run"]](object = seurat.object.integrated, dims = PcaDimsUse, check_duplicates = F)
  }else if ("umap" %in% dim_red_method == T) {
    seurat.object.integrated <- DimensionReductionMethods[[dim_red_method]][["run"]](object = seurat.object.integrated, dims = PcaDimsUse, umap.method = "uwot")
  }
  
  ###################################
  ### Write out dimension reduction coordinates
  ####################################
  Outfile.con <- bzfile(paste0(PathForOutfiles, "/", PrefixOutfiles, "_", DimensionReductionMethods[[dim_red_method]][["name"]], "Plot_Coordinates", ".tsv.bz2"), "w")
  Headers<-paste("Barcode",paste(colnames(seurat.object.integrated@reductions[[dim_red_method]]@cell.embeddings), sep="", collapse="\t"), sep="\t", collapse = "\t")
  write.table(Headers, file = Outfile.con, row.names = F, col.names = F, sep="\t", quote = F)
  write.table(seurat.object.integrated@reductions[[dim_red_method]]@cell.embeddings, file = Outfile.con,  row.names = T, col.names = F, sep="\t", quote = F, append = T)
  close(Outfile.con)
}

## 14:30:52 UMAP embedding parameters a = 0.9922 b = 1.112

## 14:30:52 Read 13470 rows and found 20 numeric columns

## 14:30:52 Using Annoy for neighbor search, n_neighbors = 30

## 14:30:52 Building Annoy index with metric = cosine, n_trees = 50

## 0%   10   20   30   40   50   60   70   80   90   100%

## [----|----|----|----|----|----|----|----|----|----|

## **************************************************|
## 14:30:53 Writing NN index file to temp file /var/folders/6t/btp1jy1n5kx9qxp01fqkq7yc0000gp/T//RtmpWl0868/filee8424cc310f
## 14:30:53 Searching Annoy index using 4 threads, search_k = 3000
## 14:30:55 Annoy recall = 100%
## 14:30:56 Commencing smooth kNN distance calibration using 4 threads
## 14:30:57 Initializing from normalized Laplacian + noise
## 14:30:58 Commencing optimization for 200 epochs, with 577722 positive edges
## 14:31:05 Optimization finished

1.0.18 CLUSTER CELLS

####################################
### Globally cluster cells using integrated data
####################################
writeLines("\n*** Globally cluster cells using integrated data ***\n")

## 
## *** Globally cluster cells using integrated data ***

options(scipen=10) ## Needed to avoid an 'Error in file(file, "rt") : cannot open the connection'
seurat.object.integrated <- FindNeighbors(object = seurat.object.integrated, dims = PcaDimsUse)

## Computing nearest neighbor graph

## Computing SNN

seurat.object.integrated <- FindClusters(object = seurat.object.integrated, resolution = DefaultParameters$Resolution, graph.name = "integrated_snn")

## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
## 
## Number of nodes: 13470
## Number of edges: 471947
## 
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.8971
## Number of communities: 14
## Elapsed time: 2 seconds

####################################
### Write out cell-cluster identities
####################################
CellBarcodes <- rownames(seurat.object.integrated@meta.data)
CellClusters <- seurat.object.integrated@meta.data$seurat_clusters
Headers<-paste("Cell_barcode", paste0("seurat_cluster_r", DefaultParameters$Resolution), sep="\t")
clusters_data<-paste(CellBarcodes, CellClusters, sep="\t")
Outfile.con <- bzfile(paste0(PathForOutfiles, "/", PrefixOutfiles, "_GlobalClustering_CellClusters.tsv.bz2"), "w")
write.table(Headers, file = Outfile.con, row.names = F, col.names = F, sep="\t", quote = F)
write.table(data.frame(clusters_data), file = Outfile.con, row.names = F, col.names = F, sep="\t", quote = F, append = T)
close(Outfile.con)

####################################
### Write out number of cells per cluster
####################################
Outfile.con <- bzfile(paste0(PathForOutfiles, "/", PrefixOutfiles, "_GlobalClustering_NumbCellsPerCluster.tsv.bz2"), "w")
Headers<-paste("Cluster", "Number_of_cells", sep="\t", collapse = "")
write.table(Headers, file = Outfile.con, row.names = F, col.names = F, sep="\t", quote = F)
write.table(table(CellClusters), file = Outfile.con, row.names = F, col.names = F, sep="\t", quote = F, append = T)
close(Outfile.con)

####################################
### Print out the integrated matrices and cell cluster identities for CBW_CAN_SingleCell_Lab6_InferCNV.R
####################################
Idents(seurat.object.integrated) <- "dataset"
### Subset sample
seurat.object.integrated.subset <- subset(x = seurat.object.integrated, idents = SampleForInferCNV)
print(seurat.object.integrated.subset)

## An object of class Seurat 
## 34523 features across 1536 samples within 3 assays 
## Active assay: SCT (16049 features, 0 variable features)
##  2 other assays present: RNA, integrated
##  3 dimensional reductions calculated: pca, umap, tsne

ASSAY <- "SCT"
### Write out count matrices in MTX format  
OutdirMTX <- paste0("~/CourseData/CAN_data/Module7/OUTFILES/Richards_NatCancer_2021/", AnchorFinder, "/WITHOUT_G945_I_T",
                    "/INTEGRATED_MATRIX_", SampleForInferCNV, "/", ASSAY, "/MTX")
dir.create(file.path(OutdirMTX), showWarnings = F, recursive = T)
write10xCounts(path = OutdirMTX, x = seurat.object.integrated.subset@assays[[ASSAY]]@counts, gene.type="Gene Expression", overwrite=T, type="sparse", version="3")

### Write out cell-cluster identities
CellBarcodes <- rownames(seurat.object.integrated.subset@meta.data)
CellClusters <- seurat.object.integrated.subset@meta.data$seurat_clusters
clusters_data<-paste(CellBarcodes, CellClusters, sep="\t")
Outfile.con <- bzfile(paste0(PathForOutfiles, "/", PrefixOutfiles, "_GlobalClustering_CellClusters_", SampleForInferCNV, ".tsv.bz2"), "w")
write.table(data.frame(clusters_data), file = Outfile.con, row.names = F, col.names = F, sep="\t", quote = F, append = T)
close(Outfile.con)

1.0.19 GENERATE UMAP PLOTS

####################################
### Colour dimension reduction plots by global cell-clusters
####################################
plots <- DimPlot(seurat.object.integrated, group.by = c("seurat_clusters"), combine = F, reduction = DimRedMethodPlots, label = T, label.size = 10)
plots <- lapply(X = plots, FUN = function(x) x + theme(legend.position = "right") + guides(color = guide_legend(override.aes = list(size = 3))))

OutfilePdf<-paste0(PathForOutfiles, "/", PrefixOutfiles, "_", DimensionReductionMethods[[DimRedMethodPlots]][["name"]], "Plot_GlobalClustering_ColourByCellClusters", ".pdf")
pdf(file=OutfilePdf, width = 8, height = 7)
print(plots)

## [[1]]

dev.off()

## quartz_off_screen 
##                 2

print(plots)

## [[1]]

####################################
### Colour dimension reduction plots by dataset
####################################
plots <- DimPlot(seurat.object.integrated, group.by = c("dataset"), combine = F, reduction = DimRedMethodPlots)
plots <- lapply(X = plots, FUN = function(x) x + theme(legend.position = "top") + guides(color = guide_legend(ncol = 6, override.aes = list(size = 3))))

OutfilePdf<-paste0(PathForOutfiles, "/", PrefixOutfiles, "_", DimensionReductionMethods[[DimRedMethodPlots]][["name"]], "Plot_GlobalClustering_ColourByDataset", ".pdf")
pdf(file=OutfilePdf, width = 7, height = 8)
print(plots)

## [[1]]

dev.off()

## quartz_off_screen 
##                 2

print(plots)

## [[1]]

####################################
### Colour dimension reduction plots for all cells by selected genes
####################################
seurat.object.integrated@active.assay <- "SCT"
plots <- FeaturePlot(object = seurat.object.integrated, features = SelectedGenes, cols = c("lightgrey", "blue"),
                     reduction = DimRedMethodPlots, order = T, slot = "data", pt.size = 0.3, min.cutoff = "q0.1", max.cutoff = "q90",
                     ncol = 3)

OutfilePdf<-paste0(PathForOutfiles, "/", PrefixOutfiles, "_", DimensionReductionMethods[[DimRedMethodPlots]][["name"]], "Plot_AllCells_ColourBySelectedGenes", ".pdf")
pdf(file=OutfilePdf, width=8, height=8)
print(plots)
dev.off()

## quartz_off_screen 
##                 2

print(plots)

####################################
### Colour dimension reduction plots using cell-level metadata
####################################
### Load Metadata
CellPropertiesFromMetadata <- data.frame(read.table(InfileMetadata, header = T, row.names = 1, check.names = F))
head(CellPropertiesFromMetadata)

##                            cell_cluster cell_cluster.cell_type cell_type found_in_cell_cluster
## G1003_A_T_AAACCTGAGTATCGAA            1           01.malignant malignant                     1
## G1003_A_T_AAACCTGCAGTCCTTC            9           09.malignant malignant                     1
## G1003_A_T_AAACCTGTCAGTTTGG           11           11.malignant malignant                     1
## G1003_A_T_AAACCTGTCGGAATCT            2           02.malignant malignant                     1
## G1003_A_T_AAACCTGTCGTTTAGG            2           02.malignant malignant                     1
## G1003_A_T_AAACGGGAGAAACCGC            2           02.malignant malignant                     1
##                            dataset_id monochrome
## G1003_A_T_AAACCTGAGTATCGAA  G1003_A_T          1
## G1003_A_T_AAACCTGCAGTCCTTC  G1003_A_T          1
## G1003_A_T_AAACCTGTCAGTTTGG  G1003_A_T          1
## G1003_A_T_AAACCTGTCGGAATCT  G1003_A_T          1
## G1003_A_T_AAACCTGTCGTTTAGG  G1003_A_T          1
## G1003_A_T_AAACGGGAGAAACCGC  G1003_A_T          1

### Add Metadata to Seurat object
head(seurat.object.integrated@meta.data)

##                            orig.ident nCount_RNA nFeature_RNA dataset.label mito.fraction
## G1003_A_T_AAACCTGAGTATCGAA      G1003       6271         2252     G1003_A_T    0.03364695
## G1003_A_T_AAACCTGCAGTCCTTC      G1003      12496         3440     G1003_A_T    0.04793534
## G1003_A_T_AAACCTGTCAGTTTGG      G1003      11705         3734     G1003_A_T    0.10081162
## G1003_A_T_AAACCTGTCGGAATCT      G1003       7791         2931     G1003_A_T    0.12347581
## G1003_A_T_AAACCTGTCGTTTAGG      G1003       8277         3083     G1003_A_T    0.09423704
## G1003_A_T_AAACGGGAGAAACCGC      G1003       5297         2415     G1003_A_T    0.06890693
##                              dataset nCount_SCT nFeature_SCT integrated_snn_res.0.5
## G1003_A_T_AAACCTGAGTATCGAA G1003_A_T       7200         2252                      2
## G1003_A_T_AAACCTGCAGTCCTTC G1003_A_T       8470         3270                      8
## G1003_A_T_AAACCTGTCAGTTTGG G1003_A_T       8599         3666                     11
## G1003_A_T_AAACCTGTCGGAATCT G1003_A_T       7779         2931                      1
## G1003_A_T_AAACCTGTCGTTTAGG G1003_A_T       8026         3083                      1
## G1003_A_T_AAACGGGAGAAACCGC G1003_A_T       6973         2417                      1
##                            seurat_clusters
## G1003_A_T_AAACCTGAGTATCGAA               2
## G1003_A_T_AAACCTGCAGTCCTTC               8
## G1003_A_T_AAACCTGTCAGTTTGG              11
## G1003_A_T_AAACCTGTCGGAATCT               1
## G1003_A_T_AAACCTGTCGTTTAGG               1
## G1003_A_T_AAACGGGAGAAACCGC               1

seurat.object.integrated <- AddMetaData(object = seurat.object.integrated, metadata = CellPropertiesFromMetadata)

for (property in MetedataPropsToPlot) {
  plots <- DimPlot(seurat.object.integrated, group.by = property, combine = F, reduction = DimRedMethodPlots, label = T)
  plots <- lapply(X = plots, FUN = function(x) x + theme(legend.position = "top", legend.text.align = 0) + labs(title = property) +
                    guides(color = guide_legend(ncol = 5, override.aes = list(size = 3))))

  OutfilePdf <- paste0(PathForOutfiles, "/", PrefixOutfiles, "_", DimensionReductionMethods[[DimRedMethodPlots]][["name"]], "Plot_Metadata_", property, ".pdf")
  pdf(file=OutfilePdf, width = 7, height = 7)
  print(plots)
  dev.off()
  print(plots)
}

## [[1]]

## 
## [[1]]

## 
## [[1]]

## 
## [[1]]

####################################
### Colour dimension reduction plots by global cell-clusters, subsetting each dataset
####################################
for (datasetID in unique(seurat.object.integrated@meta.data$dataset)) {
  Idents(seurat.object.integrated) <- "dataset"
  seurat.object.integrated.subset <- subset(x = seurat.object.integrated, idents = datasetID)

  plots <- DimPlot(seurat.object.integrated.subset, group.by = "seurat_clusters", combine = F, reduction = DimRedMethodPlots, label = T)
  plots <- lapply(X = plots, FUN = function(x) x + theme(legend.position = "top", legend.text.align = 0)
                  + labs(title = paste0("seurat_clusters - ", datasetID))
                  + guides(color = guide_legend(ncol = 7, override.aes = list(size = 3))))

  OutfilePdf<-paste0(PathForOutfiles, "/", PrefixOutfiles, "_", DimensionReductionMethods[[DimRedMethodPlots]][["name"]], "Plot_", datasetID, "_GlobalClustering_ColourByCellClusters.pdf")
  pdf(file=OutfilePdf, width = 7, height = 7)
  print(plots)
  dev.off()
  print(plots)
}

## [[1]]

## 
## [[1]]

## 
## [[1]]

## 
## [[1]]

## 
## [[1]]

## 
## [[1]]

## 
## [[1]]

## 
## [[1]]

## 
## [[1]]

## 
## [[1]]

## 
## [[1]]

## 
## [[1]]

1.0.20 GENERATE DOT PLOTS

####################################
### Dot plots: genes (x-axis) cell-properties (y-axis), all cells
####################################
Idents(seurat.object.integrated) <- "seurat_clusters"
plots <- DotPlot(seurat.object.integrated, features = SelectedGenes, assay = ASSAY, cluster.idents = T)
plots <- plots + RotatedAxis()

OutfilePdf<-paste0(PathForOutfiles, "/", PrefixOutfiles, "_", "DotPlots_AllCells_SelectedGenes.pdf")
pdf(file=OutfilePdf, width = 7, height = 7)
print(plots)
dev.off()

## quartz_off_screen 
##                 2

print(plots)

####################################
### Dot plots: genes (x-axis) cell-properties (y-axis), each dataset cells
####################################
Idents(seurat.object.integrated) <- "seurat_clusters"
plots <- DotPlot(seurat.object.integrated, features = SelectedGenes, idents = ClustersForDotPlot2,
                 cols = c("cornflowerblue", "darkolivegreen4", "chocolate1", "aquamarine3", "burlywood3", 
                          "azure4", "brown1"), dot.scale = 4, split.by = "dataset")
plots <- plots + RotatedAxis()


OutfilePdf<-paste0(PathForOutfiles, "/", PrefixOutfiles, "_", "DotPlots_EachDatasetCells", ".pdf")
pdf(file=OutfilePdf, width = 7, height = 9)
print(plots)
dev.off()

## quartz_off_screen 
##                 2

print(plots)

1.0.21 GET AVERAGE GENE EXPRESSION PER CLUSTER

####################################
### Get average gene expression for each global cluster
####################################
Idents(object = seurat.object.integrated) <- "seurat_clusters"

cluster.averages <- AverageExpression(object = seurat.object.integrated, use.scale = F, use.counts = F, assays = ASSAY)

## The following functions and any applicable methods accept the dots: CreateSeuratObject

Outfile.con <- bzfile(paste0(PathForOutfiles, "/", PrefixOutfiles, "_AverageGeneExpression_GlobalClustering.tsv.bz2"), "w")
Headers<-paste("AVERAGE_GENE_EXPRESSION", paste("c", colnames(cluster.averages[[ASSAY]]), sep="", collapse="\t"), sep="\t", collapse = "\t")
write.table(Headers, file = Outfile.con, row.names = F, col.names = F, sep="\t", quote = F)
write.table(data.frame(cluster.averages[[ASSAY]]), file = Outfile.con, row.names = T, col.names = F, sep="\t", quote = F, append = T)
close(Outfile.con)

1.0.22 SAVE INTEGRATED R_OBJECT

####################################
### Saving integrated R object
####################################
OutfileRDS<-paste0(PathForOutfiles, "/", PrefixOutfiles, "_PCA_Clustering_DimReduction.rds")
saveRDS(seurat.object.integrated, file = OutfileRDS)

1.0.23 OBTAIN COMPUTING TIME

####################################
### Obtain computing time used
####################################
StopWatchEnd$Overall  <- Sys.time()

OutfileCPUtimes<-paste0(PathForOutfiles, "/", PrefixOutfiles, "_PCA_Clustering_DimReduction_CPUtimes.txt")

write(file = OutfileCPUtimes, x = paste("#Number_of_cores_used", NumbCoresToUse, sep = "\t", collapse = ""))
write(file = OutfileCPUtimes, x = paste("#MaxGlobalVariables", MaxGlobalVariables, sep = "\t", collapse = ""), append = T)

Headers<-paste("Step", "Time(minutes)", sep="\t")
write.table(Headers, file = OutfileCPUtimes, row.names = F, col.names = F, sep="\t", quote = F, append = T)

lapply(names(StopWatchStart), function(stepToClock) {
  if (regexpr("POSIXct", class(StopWatchStart[[stepToClock]]), ignore.case = T)[1] == 1) {
    TimeStart <- StopWatchStart[[stepToClock]]
    TimeEnd   <- StopWatchEnd[[stepToClock]]
    TimeDiff <- format(difftime(TimeEnd, TimeStart, units = "min"))
    ReportTime<-c(paste(stepToClock, TimeDiff, sep = "\t", collapse = ""))
    write(file = OutfileCPUtimes, x=gsub(pattern = " mins", replacement = "", x = ReportTime), append = T)
  }
})

## [[1]]
## NULL

1.0.24 FINISH

####################################
### Finish
####################################
options(warn = oldw)
writeLines(paste0("\nEND - Check:\n", PathForOutfiles, "\nFor outfiles\n\n"))

## 
## END - Check:
## /Users/jdiazmej/CourseData/CAN_data/Module7/OUTFILES/Richards_NatCancer_2021/STACAS/WITHOUT_G945_I_T
## For outfiles

# quit()

Javier Diaz - javier.diazmejia@gmail.com
Script based on: https://satijalab.org/seurat/articles/integration_introduction.html
GENERAL OVERVIEW OF THIS SCRIPT
1. Loads integrated datasets R object produced by script CBW_CAN_2021_Module7_Lab3_PCA_Clustering_DimReduction.R
2. Computes differential gene expression (DGE) for each cell-class vs. the rest of cells
3. Computes DGE for one cell-class vs. another cell-class

1.0.25 DEFINE ENVIRONMENT, INPUTS AND OUTDIR

oldw <- getOption("warn")
options( warn = -1 )

1.0.25.0.0.1 Required libraries

library(Seurat)  # (CRAN) main scRNA-seq analysis package
library(future)  # (CRAN) to run parallel processes

1.0.25.0.0.2 User’s home and stopwatch to time run

UserHomeDirectory <- Sys.getenv("HOME")[[1]]
print(UserHomeDirectory)

## [1] "/Users/jdiazmej"

StopWatchStart <- list()
StopWatchEnd   <- list()

StopWatchStart$Overall  <- Sys.time()

1.0.25.0.0.3 Define inputs

### Run specific inputs and parameters

AnchorFinder        <- "STACAS"
PathToIntegratedRds <- paste0("~/CourseData/CAN_data/Module7/OUTFILES/Richards_NatCancer_2021/", AnchorFinder, "/WITHOUT_G945_I_T", "/Richards_NatCancer_2021_PCA_Clustering_DimReduction.rds") ### /path_to/*rds (R object) from CBW_CAN_2021_Module7_Lab3_PCA_Clustering_DimReduction.R
InfileMetadata      <- "~/CourseData/CAN_data/Module7/METADATA/Richards_NatCancer_2021_without_G945_I_T.metadata.tsv"
PathToIntegratedRds <- gsub("^~/",paste0(UserHomeDirectory,"/"), PathToIntegratedRds)
ASSAY               <- "SCT" ### Note, there is debate in the field about whether DGE should be calculated using RNA or SCT

1.0.25.0.0.4 Define outputs

### Outputs
PathForOutfiles     <- paste0("~/CourseData/CAN_data/Module7/OUTFILES/Richards_NatCancer_2021/", AnchorFinder, "/WITHOUT_G945_I_T") ## /path_to/out_directory
PrefixOutfiles      <- "Richards_NatCancer_2021"  ## Prefix for outfiles
PathForOutfiles     <- gsub("^~/",paste0(UserHomeDirectory,"/"), PathForOutfiles)
dir.create(path = PathForOutfiles, recursive = T, showWarnings = F)

1.0.25.0.0.5 Define default parameters

### Default parameters. Either suggested by Seurat developers, or tailored empirically.
DefaultParameters <- list(
  
  ### Parameters for DGE calculation OneVsRest
  Test_OneVsRest               = "wilcox",
  OnlyPos_OneVsRest            = F,
  ReturnMinPctThresh_OneVsRest = 0,  # Use 0 if all cells should be included
  ReturnPvalThresh_OneVsRest   = 1,  # Use 1 if NO p-value cutoff should be used
  ReturnLogFcThresh_OneVsRest  = 0,  # Use 0 if NO LogFc cutoff should be used

  ### Parameters for DGE calculation Paired
  Test_Paired                  = "wilcox",
  OnlyPos_Paired               = F,
  ReturnMinPctThresh_Paired    = 0,  # Use 0 if all cells should be included
  ReturnPvalThresh_Paired      = 1,  # Use 1 if NO p-value cutoff should be used
  ReturnLogFcThresh_Paired     = 0,  # Use 0 if NO LogFc cutoff should be used
  subclass1_Paired             = "malignant",
  subclass2_Paired             = "oligodendrocyte"
)

1.0.25.0.0.6 Report R sessionInfo

OutfileRSessionInfo<-paste0(PathForOutfiles, "/", PrefixOutfiles, "_SingleCell_4_DGE_RSessionInfo.txt")
writeLines(capture.output(sessionInfo()), OutfileRSessionInfo)
capture.output(sessionInfo())

##  [1] "R version 4.0.2 (2020-06-22)"                                                                                           
##  [2] "Platform: x86_64-apple-darwin17.0 (64-bit)"                                                                             
##  [3] "Running under: macOS High Sierra 10.13.6"                                                                               
##  [4] ""                                                                                                                       
##  [5] "Matrix products: default"                                                                                               
##  [6] "BLAS:   /System/Library/Frameworks/Accelerate.framework/Versions/A/Frameworks/vecLib.framework/Versions/A/libBLAS.dylib"
##  [7] "LAPACK: /Library/Frameworks/R.framework/Versions/4.0/Resources/lib/libRlapack.dylib"                                    
##  [8] ""                                                                                                                       
##  [9] "locale:"                                                                                                                
## [10] "[1] en_CA.UTF-8/en_CA.UTF-8/en_CA.UTF-8/C/en_CA.UTF-8/en_CA.UTF-8"                                                      
## [11] ""                                                                                                                       
## [12] "attached base packages:"                                                                                                
## [13] "[1] parallel  stats4    stats     graphics  grDevices utils     datasets  methods   base     "                          
## [14] ""                                                                                                                       
## [15] "other attached packages:"                                                                                               
## [16] " [1] DropletUtils_1.8.0          SingleCellExperiment_1.10.1 SummarizedExperiment_1.18.2"                               
## [17] " [4] DelayedArray_0.14.1         matrixStats_0.59.0          Biobase_2.48.0             "                               
## [18] " [7] GenomicRanges_1.40.0        GenomeInfoDb_1.24.2         IRanges_2.22.2             "                               
## [19] "[10] S4Vectors_0.26.1            BiocGenerics_0.34.0         ggplot2_3.3.3              "                               
## [20] "[13] STACAS_1.1.0                future_1.21.0               SeuratObject_4.0.1         "                               
## [21] "[16] Seurat_4.0.2               "                                                                                       
## [22] ""                                                                                                                       
## [23] "loaded via a namespace (and not attached):"                                                                             
## [24] "  [1] plyr_1.8.6             igraph_1.2.6           lazyeval_0.2.2         splines_4.0.2         "                      
## [25] "  [5] BiocParallel_1.22.0    listenv_0.8.0          scattermore_0.7        digest_0.6.27         "                      
## [26] "  [9] htmltools_0.5.1.1      fansi_0.5.0            magrittr_2.0.1         tensor_1.5            "                      
## [27] " [13] cluster_2.1.2          ROCR_1.0-11            limma_3.44.3           globals_0.14.0        "                      
## [28] " [17] R.utils_2.10.1         spatstat.sparse_2.0-0  colorspace_2.0-1       ggrepel_0.9.1         "                      
## [29] " [21] xfun_0.23              dplyr_1.0.6            crayon_1.4.1           RCurl_1.98-1.3        "                      
## [30] " [25] jsonlite_1.7.2         spatstat.data_2.1-0    survival_3.2-11        zoo_1.8-9             "                      
## [31] " [29] glue_1.4.2             polyclip_1.10-0        gtable_0.3.0           zlibbioc_1.34.0       "                      
## [32] " [33] XVector_0.28.0         leiden_0.3.8           Rhdf5lib_1.10.1        future.apply_1.7.0    "                      
## [33] " [37] HDF5Array_1.16.1       abind_1.4-5            scales_1.1.1           edgeR_3.30.3          "                      
## [34] " [41] DBI_1.1.1              miniUI_0.1.1.1         Rcpp_1.0.6             viridisLite_0.4.0     "                      
## [35] " [45] xtable_1.8-4           dqrng_0.3.0            reticulate_1.20        spatstat.core_2.1-2   "                      
## [36] " [49] htmlwidgets_1.5.3      httr_1.4.2             RColorBrewer_1.1-2     ellipsis_0.3.2        "                      
## [37] " [53] ica_1.0-2              R.methodsS3_1.8.1      pkgconfig_2.0.3        farver_2.1.0          "                      
## [38] " [57] sass_0.4.0             uwot_0.1.10            deldir_0.2-10          locfit_1.5-9.4        "                      
## [39] " [61] utf8_1.2.1             tidyselect_1.1.1       labeling_0.4.2         rlang_0.4.11          "                      
## [40] " [65] reshape2_1.4.4         later_1.2.0            munsell_0.5.0          tools_4.0.2           "                      
## [41] " [69] generics_0.1.0         ggridges_0.5.3         evaluate_0.14          stringr_1.4.0         "                      
## [42] " [73] fastmap_1.1.0          yaml_2.2.1             goftest_1.2-2          knitr_1.33            "                      
## [43] " [77] fitdistrplus_1.1-5     purrr_0.3.4            RANN_2.6.1             packrat_0.6.0         "                      
## [44] " [81] pbapply_1.4-3          nlme_3.1-152           mime_0.10              R.oo_1.24.0           "                      
## [45] " [85] compiler_4.0.2         rstudioapi_0.13        plotly_4.9.3           png_0.1-7             "                      
## [46] " [89] spatstat.utils_2.1-0   tibble_3.1.2           bslib_0.2.5.1          stringi_1.6.2         "                      
## [47] " [93] highr_0.9              RSpectra_0.16-0        lattice_0.20-44        Matrix_1.3-3          "                      
## [48] " [97] vctrs_0.3.8            pillar_1.6.1           lifecycle_1.0.0        spatstat.geom_2.1-0   "                      
## [49] "[101] lmtest_0.9-38          jquerylib_0.1.4        RcppAnnoy_0.0.18       data.table_1.14.0     "                      
## [50] "[105] cowplot_1.1.1          bitops_1.0-7           irlba_2.3.3            httpuv_1.6.1          "                      
## [51] "[109] patchwork_1.1.1        R6_2.5.0               bookdown_0.22          promises_1.2.0.1      "                      
## [52] "[113] KernSmooth_2.23-20     gridExtra_2.3          parallelly_1.25.0      codetools_0.2-18      "                      
## [53] "[117] MASS_7.3-54            assertthat_0.2.1       rhdf5_2.32.4           withr_2.4.2           "                      
## [54] "[121] sctransform_0.3.2      GenomeInfoDbData_1.2.3 mgcv_1.8-35            grid_4.0.2            "                      
## [55] "[125] rpart_4.1-15           tidyr_1.1.3            rmarkdown_2.8          Rtsne_0.15            "                      
## [56] "[129] shiny_1.6.0           "

1.0.25.0.0.7 Define number of cores and RAM for parallelization

NumbCores <- "MAX"
MaxGlobalVariables <- 30000

if (regexpr("^MAX$", NumbCores, ignore.case = T)[1] == 1) {
  NumbCoresToUse <- availableCores()[[1]]
}else if (regexpr("^[0-9]+$", NumbCores, ignore.case = T)[1] == 1) {
  NumbCoresToUse <- as.numeric(NumbCores)
}else{
  stop(paste0("Unexpected format for NumbCores: ", NumbCores, "\n"))
}

if (NumbCoresToUse == 1) {
  plan(strategy = "sequential")
  writeLines(paste0("\n", "*** Running: in 'sequential' mode with ", NumbCoresToUse, " core ***", "\n"))
}else if (NumbCoresToUse > 1) {
  plan(strategy = "multicore", workers = NumbCoresToUse)
  writeLines(paste0("\n", "*** Running: in 'multicore' mode with ", NumbCoresToUse, " cores ***", "\n"))
}else{
  stop(paste0("Unexpected NumbCores = ", NumbCoresToUse))
}

## 
## *** Running: in 'multicore' mode with 4 cores ***

options(future.globals.maxSize = MaxGlobalVariables * 1024^2)

1.0.26 LOAD R OBJECT

####################################
### Load R Object
####################################
seurat.object.integrated <- readRDS(PathToIntegratedRds)

1.0.27 LOAD METADATA

####################################
### Loading metadata from --infile_metadata
####################################
CellPropertiesFromMetadata <- data.frame(read.table(InfileMetadata, header = T, row.names = 1, check.names = F))
head(CellPropertiesFromMetadata)

##                            cell_cluster cell_cluster.cell_type cell_type found_in_cell_cluster
## G1003_A_T_AAACCTGAGTATCGAA            1           01.malignant malignant                     1
## G1003_A_T_AAACCTGCAGTCCTTC            9           09.malignant malignant                     1
## G1003_A_T_AAACCTGTCAGTTTGG           11           11.malignant malignant                     1
## G1003_A_T_AAACCTGTCGGAATCT            2           02.malignant malignant                     1
## G1003_A_T_AAACCTGTCGTTTAGG            2           02.malignant malignant                     1
## G1003_A_T_AAACGGGAGAAACCGC            2           02.malignant malignant                     1
##                            dataset_id monochrome
## G1003_A_T_AAACCTGAGTATCGAA  G1003_A_T          1
## G1003_A_T_AAACCTGCAGTCCTTC  G1003_A_T          1
## G1003_A_T_AAACCTGTCAGTTTGG  G1003_A_T          1
## G1003_A_T_AAACCTGTCGGAATCT  G1003_A_T          1
## G1003_A_T_AAACCTGTCGTTTAGG  G1003_A_T          1
## G1003_A_T_AAACGGGAGAAACCGC  G1003_A_T          1

1.0.28 FIND DGE USING GLOBAL CELL CLUSTERS DIMENSIONS

Idents(object = seurat.object.integrated) <- "seurat_clusters"

### For details on this approach to use a pseudocount, see https://f1000research.com/articles/7-1522
FindMarkers.Pseudocount  <- 1/length(rownames(seurat.object.integrated@meta.data))

###  base = exp(1) is needed in Seurat v4 because the default uses Log2FC instead of LogNatFC, like v3 does
seurat.object.integrated.markers <- FindAllMarkers(object = seurat.object.integrated,
                                                   assay = ASSAY,
                                                   test.use = DefaultParameters$Test_OneVsRest,
                                                   only.pos = DefaultParameters$OnlyPos_OneVsRest,
                                                   min.pct =  DefaultParameters$ReturnMinPctThresh_OneVsRest,
                                                   return.thresh =   DefaultParameters$ReturnPvalThresh_OneVsRest,
                                                   logfc.threshold = DefaultParameters$ReturnLogFcThresh_OneVsRest,
                                                   pseudocount.use = FindMarkers.Pseudocount,
                                                   base = exp(1))

## Calculating cluster 0

## Calculating cluster 1

## Calculating cluster 2

## Calculating cluster 3

## Calculating cluster 4

## Calculating cluster 5

## Calculating cluster 6

## Calculating cluster 7

## Calculating cluster 8

## Calculating cluster 9

## Calculating cluster 10

## Calculating cluster 11

## Calculating cluster 12

## Calculating cluster 13

### IMPORTANT: use column 'gene' instead of rownames
head(seurat.object.integrated.markers)

##        p_val avg_logFC pct.1 pct.2 p_val_adj cluster   gene
## IL1A       0  3.255033 0.213 0.013         0       0   IL1A
## CCL4       0  3.249802 0.955 0.308         0       0   CCL4
## CCL3L3     0  3.231801 0.922 0.180         0       0 CCL3L3
## INHBA      0  3.090476 0.275 0.024         0       0  INHBA
## CCL4L2     0  3.040385 0.851 0.179         0       0 CCL4L2
## CCL3       0  2.974216 0.987 0.407         0       0   CCL3

SimplifiedDiffExprGenes.df <- seurat.object.integrated.markers[,c("cluster","gene","p_val","p_val_adj","avg_logFC","pct.1","pct.2")]

### Add -Log10(Pval) * sign of FC
SimplifiedDiffExprGenes.df[["mLog10Pval_FCsign"]] <- (log10(SimplifiedDiffExprGenes.df[["p_val"]])*-1) * sign(SimplifiedDiffExprGenes.df[,"avg_logFC"])
### Replace -Inf and Inf in mLog10Pval_FCsign
SimplifiedDiffExprGenes.df[,"mLog10Pval_FCsign"][SimplifiedDiffExprGenes.df[,"mLog10Pval_FCsign"] ==  Inf]  <- log10(min(SimplifiedDiffExprGenes.df[,"p_val"][SimplifiedDiffExprGenes.df[,"p_val"] > 0]))*-1
SimplifiedDiffExprGenes.df[,"mLog10Pval_FCsign"][SimplifiedDiffExprGenes.df[,"mLog10Pval_FCsign"] ==  -Inf] <- log10(max(SimplifiedDiffExprGenes.df[,"p_val"][SimplifiedDiffExprGenes.df[,"p_val"] > 0]))*-1
head(SimplifiedDiffExprGenes.df)

##        cluster   gene p_val p_val_adj avg_logFC pct.1 pct.2 mLog10Pval_FCsign
## IL1A         0   IL1A     0         0  3.255033 0.213 0.013          307.2867
## CCL4         0   CCL4     0         0  3.249802 0.955 0.308          307.2867
## CCL3L3       0 CCL3L3     0         0  3.231801 0.922 0.180          307.2867
## INHBA        0  INHBA     0         0  3.090476 0.275 0.024          307.2867
## CCL4L2       0 CCL4L2     0         0  3.040385 0.851 0.179          307.2867
## CCL3         0   CCL3     0         0  2.974216 0.987 0.407          307.2867

OutfileDGE <- paste0(PathForOutfiles, "/", PrefixOutfiles, "_DGE_GlobalClustering_", ASSAY, "_", DefaultParameters$Test_OneVsRest, ".tsv.bz2")
Outfile.con <- bzfile(OutfileDGE, "w")
write.table(x = data.frame(SimplifiedDiffExprGenes.df), file = Outfile.con, row.names = F, sep="\t", quote = F)
close(Outfile.con)

1.0.29 FIND DGE USING PAIRED METADATA CLASSES

Idents(seurat.object.integrated) <- "cell_type"
FindMarkers.Pseudocount  <- 1/length(rownames(seurat.object.integrated@meta.data))
###  base = exp(1) is needed in Seurat v4 because the default uses Log2FC instead of LogNatFC, like v3 does
seurat.object.integrated.markers <- data.frame(FindMarkers(object = seurat.object.integrated,
                                                           assay = ASSAY,
                                                           test.use = DefaultParameters$Test_Paired,
                                                           only.pos = DefaultParameters$OnlyPos_Paired,
                                                           ident.1  = DefaultParameters$subclass1_Paired,
                                                           ident.2  = DefaultParameters$subclass2_Paired,
                                                           min.pct  = DefaultParameters$ReturnMinPctThresh_Paired,
                                                           return.thresh   = DefaultParameters$ReturnPvalThresh_Paired,
                                                           logfc.threshold = DefaultParameters$ReturnLogFcThresh_Paired,
                                                           pseudocount.use = FindMarkers.Pseudocount,
                                                           base = exp(1)))
seurat.object.integrated.markers$class1 <- DefaultParameters$subclass1_Paired
seurat.object.integrated.markers$class2 <- DefaultParameters$subclass2_Paired
seurat.object.integrated.markers$gene   <- rownames(seurat.object.integrated.markers)

SimplifiedDiffExprGenes.df <- seurat.object.integrated.markers[,c("class1","class2","gene","p_val","p_val_adj","avg_logFC","pct.1","pct.2")]

### Add -Log10(Pval) * sign of FC
SimplifiedDiffExprGenes.df[["mLog10Pval_FCsign"]] <- (log10(SimplifiedDiffExprGenes.df[["p_val"]])*-1) * sign(SimplifiedDiffExprGenes.df[,"avg_logFC"])
### Replace -Inf and Inf in mLog10Pval_FCsign
SimplifiedDiffExprGenes.df[,"mLog10Pval_FCsign"][SimplifiedDiffExprGenes.df[,"mLog10Pval_FCsign"] ==  Inf]  <- log10(min(SimplifiedDiffExprGenes.df[,"p_val"][SimplifiedDiffExprGenes.df[,"p_val"] > 0]))*-1
SimplifiedDiffExprGenes.df[,"mLog10Pval_FCsign"][SimplifiedDiffExprGenes.df[,"mLog10Pval_FCsign"] ==  -Inf] <- log10(max(SimplifiedDiffExprGenes.df[,"p_val"][SimplifiedDiffExprGenes.df[,"p_val"] > 0]))*-1
head(SimplifiedDiffExprGenes.df)

##            class1          class2    gene p_val p_val_adj  avg_logFC pct.1 pct.2 mLog10Pval_FCsign
## VWA1    malignant oligodendrocyte    VWA1     0         0 -1.9362850 0.202 0.757            0.0000
## RBP7    malignant oligodendrocyte    RBP7     0         0 -5.0060315 0.005 0.510            0.0000
## CAMK2N1 malignant oligodendrocyte CAMK2N1     0         0 -2.4241825 0.289 0.930            0.0000
## RPL11   malignant oligodendrocyte   RPL11     0         0  0.9726906 0.998 0.986          307.2163
## YBX1    malignant oligodendrocyte    YBX1     0         0  1.5010289 0.991 0.873          307.2163
## TMEM125 malignant oligodendrocyte TMEM125     0         0 -5.6157419 0.005 0.637            0.0000

OutfileDGE <- paste0(PathForOutfiles, "/", PrefixOutfiles, "_DGE_Paired_", ASSAY, "_", DefaultParameters$Test_Paired, ".tsv.bz2")
Outfile.con <- bzfile(OutfileDGE, "w")
write.table(x = data.frame(SimplifiedDiffExprGenes.df), file = Outfile.con, row.names = F, sep="\t", quote = F)
close(Outfile.con)

1.0.30 OBTAIN COMPUTING TIME

####################################
### Obtain computing time used
####################################
StopWatchEnd$Overall  <- Sys.time()
OutfileCPUtimes<-paste0(PathForOutfiles, "/", PrefixOutfiles, "_DGE_CPUtimes.txt")
write(file = OutfileCPUtimes, x = paste("#Number_of_cores_used", NumbCoresToUse, sep = "\t", collapse = ""))
write(file = OutfileCPUtimes, x = paste("#MaxGlobalVariables", MaxGlobalVariables, sep = "\t", collapse = ""), append = T)

Headers<-paste("Step", "Time(minutes)", sep="\t")
write.table(Headers, file = OutfileCPUtimes, row.names = F, col.names = F, sep="\t", quote = F, append = T)

lapply(names(StopWatchStart), function(stepToClock) {
  if (regexpr("POSIXct", class(StopWatchStart[[stepToClock]]), ignore.case = T)[1] == 1) {
    TimeStart <- StopWatchStart[[stepToClock]]
    TimeEnd   <- StopWatchEnd[[stepToClock]]
    TimeDiff <- format(difftime(TimeEnd, TimeStart, units = "min"))
    ReportTime<-c(paste(stepToClock, TimeDiff, sep = "\t", collapse = ""))
    write(file = OutfileCPUtimes, x=gsub(pattern = " mins", replacement = "", x = ReportTime), append = T)
  }
})

## [[1]]
## NULL

1.0.31 FINISH

####################################
### Finish
####################################
options(warn = oldw)
writeLines(paste0("\nEND - Check:\n", PathForOutfiles, "\nFor outfiles\n\n"))

## 
## END - Check:
## /Users/jdiazmej/CourseData/CAN_data/Module7/OUTFILES/Richards_NatCancer_2021/STACAS/WITHOUT_G945_I_T
## For outfiles

# quit()

CBW_2021_CAN_M7 Single Cell RNA

Chapter 1 Down the rabbit-hole

1.0.1 DEFINE ENVIRONMENT, INPUTS AND OUTDIR

1.0.1.0.0.1 Required libraries

1.0.1.0.0.2 User’s home and stopwatch to time run

1.0.1.0.0.3 Define inputs

1.0.1.0.0.4 Define outputs

1.0.1.0.0.5 Define default parameters

1.0.1.0.0.6 Report R sessionInfo

1.0.2 LOAD DATASETS

1.0.3 QC DATASETS

1.0.4 NORMALIZE DATASETS

1.0.5 SAVE EACH DATASET R_OBJECT

1.0.6 OBTAIN COMPUTING TIME

1.0.7 FINISH

1.0.8 DEFINE ENVIRONMENT, INPUTS AND OUTDIR

1.0.8.0.0.1 Required libraries

1.0.8.0.0.2 User’s home and stopwatch to time run

1.0.8.0.0.3 Define inputs

1.0.8.0.0.4 Define outputs

1.0.8.0.0.5 Define default parameters

1.0.8.0.0.6 Report R sessionInfo

1.0.8.0.0.7 Define number of cores and RAM for parallelization

1.0.9 LOAD R OBJECTS

1.0.10 INTEGRATE DATASETS

1.0.11 INTEGRATE DATASETS

1.0.12 SAVE INTEGRATED R_OBJECT

1.0.13 OBTAIN COMPUTING TIME

1.0.14 FINISH

1.0.15 DEFINE ENVIRONMENT, INPUTS AND OUTDIR

1.0.15.0.0.1 Required libraries

1.0.15.0.0.2 User’s home and stopwatch to time run

1.0.15.0.0.3 Define inputs

1.0.15.0.0.4 Define outputs

1.0.15.0.0.5 Define default parameters

1.0.15.0.0.6 Report R sessionInfo

1.0.15.0.0.7 Define number of cores and RAM for parallelization

1.0.16 LOAD R OBJECT

1.0.17 REDUCE DIMENSIONS

1.0.18 CLUSTER CELLS

1.0.19 GENERATE UMAP PLOTS

1.0.20 GENERATE DOT PLOTS

1.0.21 GET AVERAGE GENE EXPRESSION PER CLUSTER

1.0.22 SAVE INTEGRATED R_OBJECT

1.0.23 OBTAIN COMPUTING TIME

1.0.24 FINISH

1.0.25 DEFINE ENVIRONMENT, INPUTS AND OUTDIR

1.0.25.0.0.1 Required libraries

1.0.25.0.0.2 User’s home and stopwatch to time run

1.0.25.0.0.3 Define inputs

1.0.25.0.0.4 Define outputs

1.0.25.0.0.5 Define default parameters

1.0.25.0.0.6 Report R sessionInfo

1.0.25.0.0.7 Define number of cores and RAM for parallelization

1.0.26 LOAD R OBJECT

1.0.27 LOAD METADATA

1.0.28 FIND DGE USING GLOBAL CELL CLUSTERS DIMENSIONS

1.0.29 FIND DGE USING PAIRED METADATA CLASSES

1.0.30 OBTAIN COMPUTING TIME

1.0.31 FINISH