{ "cells": [ { "cell_type": "markdown", "id": "4cc6acb5", "metadata": { "pycharm": { "name": "#%% md\n" } }, "source": [ "Directly build the SingleCellExperiment object loaded in [Tutorial 01](./01-Preprocess-Expression.ipynb) and the Seurat object loaded in [Supplementary Tutorial X](./SX_Preprocess_Expression_Seurat.ipynb):" ] }, { "cell_type": "code", "execution_count": 33, "id": "106218d1", "metadata": { "vscode": { "languageId": "r" }, "pycharm": { "name": "#%%\n" } }, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "Attaching SeuratObject\n", "\n", "\n", "Attaching package: ‘Seurat’\n", "\n", "\n", "The following object is masked from ‘package:SummarizedExperiment’:\n", "\n", " Assays\n", "\n", "\n" ] } ], "source": [ "library(Seurat, quietly = T)\n", "library(SingleCellExperiment, quietly = T)\n", "\n", "# paths\n", "data.path<-'../../data/'\n", "covid.input.path<-paste0(data.path, 'raw/covid_balf/')" ] }, { "cell_type": "markdown", "id": "c3521778", "metadata": { "pycharm": { "name": "#%% md\n" } }, "source": [ "#### Loading" ] }, { "cell_type": "markdown", "id": "2b0e5a63", "metadata": { "pycharm": { "name": "#%% md\n" } }, "source": [ "The 12 samples can be downloaded as .h5 files from [here](https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE145926). 
You can also download the cell metadata from [here](https://raw.githubusercontent.com/zhangzlab/covid_balf/master/all.cell.annotation.meta.txt)\n", "\n", "We download these files directly in the following cell:" ] }, { "cell_type": "code", "execution_count": null, "id": "80279064", "metadata": { "vscode": { "languageId": "r" }, "pycharm": { "name": "#%%\n" } }, "outputs": [], "source": [
 "# make sure the download directory exists\n",
 "dir.create(covid.input.path, recursive = TRUE, showWarnings = FALSE)\n",
 "\n",
 "# the expression files are large: raise R's download timeout (60 seconds by default) for this session\n",
 "options(timeout = max(600, getOption('timeout')))\n",
 "\n",
 "# Download `url` to `destfile` unless `destfile` already exists.\n",
 "# The data is first written to a temporary '.part' file so that an interrupted\n",
 "# download is never mistaken for a complete file; a failed download raises an error.\n",
 "download.if.missing <- function(url, destfile){\n",
 "    if (!file.exists(destfile)){\n",
 "        partial <- paste0(destfile, '.part')\n",
 "        on.exit(unlink(partial), add = TRUE)\n",
 "        status <- download.file(url, destfile = partial, mode = 'wb', quiet = TRUE)\n",
 "        if (status != 0 || !file.rename(partial, destfile)){\n",
 "            stop('Failed to download ', url)\n",
 "        }\n",
 "    }\n",
 "    invisible(destfile)\n",
 "}\n",
 "\n",
 "# download the metadata\n",
 "metadata.link <- 'https://raw.githubusercontent.com/zhangzlab/covid_balf/master/all.cell.annotation.meta.txt'\n",
 "download.if.missing(metadata.link, paste0(covid.input.path, 'metadata.txt'))\n",
 "\n",
 "# download the expression data\n",
 "sample.links <- c(\n",
 "    'https://ftp.ncbi.nlm.nih.gov/geo/samples/GSM4339nnn/GSM4339769/suppl/GSM4339769%5FC141%5Ffiltered%5Ffeature%5Fbc%5Fmatrix%2Eh5',\n",
 "    'https://ftp.ncbi.nlm.nih.gov/geo/samples/GSM4339nnn/GSM4339770/suppl/GSM4339770%5FC142%5Ffiltered%5Ffeature%5Fbc%5Fmatrix%2Eh5',\n",
 "    'https://ftp.ncbi.nlm.nih.gov/geo/samples/GSM4339nnn/GSM4339771/suppl/GSM4339771%5FC143%5Ffiltered%5Ffeature%5Fbc%5Fmatrix%2Eh5',\n",
 "    'https://ftp.ncbi.nlm.nih.gov/geo/samples/GSM4339nnn/GSM4339772/suppl/GSM4339772%5FC144%5Ffiltered%5Ffeature%5Fbc%5Fmatrix%2Eh5',\n",
 "    'https://ftp.ncbi.nlm.nih.gov/geo/samples/GSM4339nnn/GSM4339773/suppl/GSM4339773%5FC145%5Ffiltered%5Ffeature%5Fbc%5Fmatrix%2Eh5',\n",
 "    'https://ftp.ncbi.nlm.nih.gov/geo/samples/GSM4339nnn/GSM4339774/suppl/GSM4339774%5FC146%5Ffiltered%5Ffeature%5Fbc%5Fmatrix%2Eh5',\n",
 "    'https://ftp.ncbi.nlm.nih.gov/geo/samples/GSM4475nnn/GSM4475048/suppl/GSM4475048%5FC51%5Ffiltered%5Ffeature%5Fbc%5Fmatrix%2Eh5',\n",
 "    'https://ftp.ncbi.nlm.nih.gov/geo/samples/GSM4475nnn/GSM4475049/suppl/GSM4475049%5FC52%5Ffiltered%5Ffeature%5Fbc%5Fmatrix%2Eh5',\n",
 "    'https://ftp.ncbi.nlm.nih.gov/geo/samples/GSM4475nnn/GSM4475050/suppl/GSM4475050%5FC100%5Ffiltered%5Ffeature%5Fbc%5Fmatrix%2Eh5',\n",
 "    'https://ftp.ncbi.nlm.nih.gov/geo/samples/GSM4475nnn/GSM4475051/suppl/GSM4475051%5FC148%5Ffiltered%5Ffeature%5Fbc%5Fmatrix%2Eh5',\n",
 "    'https://ftp.ncbi.nlm.nih.gov/geo/samples/GSM4475nnn/GSM4475052/suppl/GSM4475052%5FC149%5Ffiltered%5Ffeature%5Fbc%5Fmatrix%2Eh5',\n",
 "    'https://ftp.ncbi.nlm.nih.gov/geo/samples/GSM4475nnn/GSM4475053/suppl/GSM4475053%5FC152%5Ffiltered%5Ffeature%5Fbc%5Fmatrix%2Eh5'\n",
 ")\n",
 "\n",
 "for (sl in sample.links){\n",
 "    # the links are percent-encoded, so decode the file name (e.g. '%5F' -> '_') before saving;\n",
 "    # the loading step later splits the file name on '_' to recover the sample ID\n",
 "    download.if.missing(sl, paste0(covid.input.path, URLdecode(basename(sl))))\n",
 "}"
 ] }, { "cell_type": "markdown", "id": "680948dd", "metadata": { "pycharm": { "name": "#%% md\n" } }, "source": [ "We can then format the downloaded files:" ] }, { "cell_type": "code", "execution_count": null, "id": "a84f6d6a", "metadata": { "vscode": { "languageId": "r" }, "pycharm": { "name": "#%%\n" } }, "outputs": [], "source": [
 "# format the metadata: cell IDs become the row names and the 7 remaining columns get their final names\n",
 "md <- read.table(paste0(covid.input.path, 'metadata.txt'), header = TRUE, row.names = 'ID')\n",
 "colnames(md) <- c('sample', 'sample_new', 'condition', 'disease', 'hasnCoV', 'cluster', 'cell.type')\n",
 "\n",
 "# expand the abbreviated condition codes into readable labels, stored as a factor in this order\n",
 "# (as.character guards against the codes having been read in as a factor)\n",
 "condition.map <- c(HC = 'Control', M = 'Moderate COVID-19', S = 'Severe COVID-19')\n",
 "md$condition <- factor(unname(condition.map[as.character(md$condition)]), levels = unname(condition.map))\n",
 "\n",
 "md <- md[md$sample != 'GSM3660650', ] # drop the non-scRNAseq dataset included in this file\n",
 "\n",
 "# sort the cells by sample, following the sample order that is also used when merging the samples\n",
 "sample.order <- c('C100', 'C144', 'C149', 'C51', 'C141', 'C145', 'C152', 'C143', 'C142', 'C146', 'C148', 'C52')\n",
 "md$sample <- factor(md$sample, levels = sample.order)\n",
 "md <- md[order(md$sample), ]\n",
 "\n",
 "# move the condition column to the end\n",
 "md <- md[c('sample', 'sample_new', 'disease', 'hasnCoV', 'cluster', 'cell.type', 'condition')]"
 ] }, { "cell_type": "code", "execution_count": null, "id": 
"bf232616", "metadata": { "vscode": { "languageId": "r" }, "pycharm": { "name": "#%%\n" } }, "outputs": [], "source": [
 "# build one Seurat object per downloaded sample, keyed by sample ID\n",
 "balf.samples <- list()\n",
 "\n",
 "suppressMessages({\n",
 "    suppressWarnings({\n",
 "        for (filename in list.files(covid.input.path, pattern = '[.]h5$')){\n",
 "            # the sample ID is the second '_'-separated field of the file name\n",
 "            sample.id <- unlist(strsplit(filename, '_'))[[2]]\n",
 "\n",
 "            # subset and format metadata: keep the part of each cell ID before the first '_'\n",
 "            # and append '-1', so that the IDs can be used to index the columns of the count matrix\n",
 "            md.sample <- md[md[['sample']] == sample.id, ]\n",
 "            rownames(md.sample) <- paste0(sub('_.*$', '', rownames(md.sample)), '-1')\n",
 "\n",
 "            # load the counts\n",
 "            counts <- Seurat::Read10X_h5(filename = paste0(covid.input.path, filename), unique.features = TRUE)\n",
 "            counts <- counts[, rownames(md.sample)] # only include cells present in the metadata\n",
 "\n",
 "            # preprocess\n",
 "            balf.samples[[sample.id]] <- CreateSeuratObject(counts = counts, project = sample.id, meta.data = md.sample)\n",
 "        }\n",
 "    })\n",
 "})"
 ] }, { "cell_type": "markdown", "id": "416f19e2", "metadata": { "pycharm": { "name": "#%% md\n" } }, "source": [ "Merge into a single file" ] }, { "cell_type": "code", "execution_count": null, "id": "f09f06d2", "metadata": { "vscode": { "languageId": "r" }, "pycharm": { "name": "#%%\n" } }, "outputs": [], "source": [
 "# every expected sample must have been loaded: a missing one would otherwise\n",
 "# become a NULL entry in the list that is passed to merge()\n",
 "missing.samples <- setdiff(sample.order, names(balf.samples))\n",
 "if (length(missing.samples) > 0){\n",
 "    stop('Samples not found in ', covid.input.path, ': ', paste(missing.samples, collapse = ', '))\n",
 "}\n",
 "\n",
 "# merge the samples in the same order as the metadata\n",
 "balf.samples <- balf.samples[sample.order]\n",
 "covid_data <- merge(balf.samples[[1]], y = balf.samples[2:length(balf.samples)],\n",
 "                    project = \"balf.covid\")\n",
 "\n",
 "# remove the feature named 'nCoV'\n",
 "covid_data <- covid_data[rownames(covid_data) != 'nCoV', ]\n",
 "\n",
 "saveRDS(covid_data, paste0(data.path, 'BALF-COVID19-Liao_et_al-NatMed-2020.rds'))"
 ] }, { "cell_type": "markdown", "id": "37df248f", "metadata": { "pycharm": { "name": "#%% md\n" } }, "source": [ "Convert into SingleCellExperiment" ] }, { "cell_type": "code", "execution_count": 19, "id": "74893b9b", "metadata": { "pycharm": { "name": "#%%\n" } }, "outputs": [], "source": [
 "# covid_data <- readRDS(paste0(data.path, 'BALF-COVID19-Liao_et_al-NatMed-2020.rds')) # optional: reload the Seurat object saved above\n", 
"covid_data_sce <- Seurat::as.SingleCellExperiment(covid_data)\n",
 "\n",
 "# drop the nCount_RNA and nFeature_RNA columns from the cell metadata,\n",
 "# going through the colData() accessor rather than writing to the S4 slot directly\n",
 "qc.columns <- c('nCount_RNA', 'nFeature_RNA')\n",
 "keep.columns <- !(colnames(colData(covid_data_sce)) %in% qc.columns)\n",
 "colData(covid_data_sce) <- colData(covid_data_sce)[, keep.columns, drop = FALSE]\n",
 "\n",
 "# remove the 'logcounts' assay before saving\n",
 "assay(covid_data_sce, 'logcounts') <- NULL\n",
 "saveRDS(covid_data_sce, paste0(data.path, 'BALF-COVID19-Liao_et_al-NatMed-2020_SCE.rds'))" ] } ], "metadata": { "kernelspec": { "display_name": "R [conda env:ccc_protocols]", "language": "R", "name": "conda-env-ccc_protocols-r" }, "language_info": { "codemirror_mode": "r", "file_extension": ".r", "mimetype": "text/x-r-source", "name": "R", "pygments_lexer": "r", "version": "4.2.2" } }, "nbformat": 4, "nbformat_minor": 5 }