Introduction to RNA Sequencing Bioinformatics

R scripts for differential expression

These scripts are used to calculate differential expression using featurecounts data

# set to work directory: make sure to set directory to project folder. Change whuo01 to your own username below
setwd("/cluster/tufts/bio/tools/training/intro-to-rnaseq/users/whuo01/intro-to-RNA-seq/")

# load shared Tufts bio library path; if you don't have access to tufts HPC, skip this step
.libPaths('/cluster/tufts/bio/tools/R_libs/3.5')

# load required libraries
library(DESeq2)
library(vsn)
library(dplyr)
library(tidyverse)
library(ggrepel)
library(DEGreport)
library(pheatmap)


# load data.
meta <- read.table("raw_data/sample_info.txt", header=TRUE)
feature_count <- read.table("./featurecounts/featurecounts_results.mod.txt",
                            header=TRUE, row.names = 1)
# remove first 6 columns by select the column 6 to 19
data <- feature_count[,6:19]

# check to make sure that all rows labels in meta are columns in data
all(colnames(data) == rownames(meta))

# Differntial expression using DESeq2
dds <- DESeqDataSetFromMatrix(countData = data, colData = meta, design = ~ condition)
dds <- DESeq(dds)
# Creating contrasts and running a Wald test
contrast <- c("condition", "SNF2", "WT")
res_unshrunken <- results(dds, contrast=contrast)
summary(res_unshrunken)
# Shrinkage of the log2 fold changes
res <- lfcShrink(dds, contrast=contrast, res=res_unshrunken)
summary(res)

# Quality control: Principle Components Analysis
# regularized log transformation (rlog)
rld <- rlog(dds, blind=TRUE)
plotPCA(rld, intgroup="condition") + geom_text(aes(label=name))

# Filtering to find significant genes using FDR cutoff of 0.05
padj.cutoff <- 0.05 # False Discovery Rate cutoff
significant_results <- res[which(res$padj < padj.cutoff),]
# save results using customized file_name
file_name = 'significant_padj_0.05.txt'
write.table(significant_results, file_name, quote=FALSE)



# Visualization

# simple plot for a single gene YOR290C (SNF2)
plotCounts(dds, gene="YOR290C", intgroup="condition")

# heatmap  
# plot top 50 genes in a heatmap: top 50 with most significant padj value
significant_results_sorted <- significant_results[order(significant_results$padj), ]
significant_genes_50 <- rownames(significant_results_sorted[1:50, ])
# extract the counts from the rlog transformed object
rld_counts <- assay(rld)
# select by row name using the list of genes:
rld_counts_sig <- rld_counts[significant_genes_50, ]
# Plot multiple genes in a heatmap:
pheatmap(rld_counts_sig,
         cluster_rows = T,
         show_rownames = T,
         annotation = meta,
         border_color = NA,
         fontsize = 10,
         scale = "row",
         fontsize_row = 8,
         height = 20)


# volcano plot
# load necessary library ggplot2
library(ggplot2)
# add another column in the results table to label the significant genes using threshold of padj<0.05 and absolute value of log2foldchange >=1
res_table <- res %>%
  data.frame() %>%
  rownames_to_column(var="gene") %>%
  as_tibble()
res_table <- res_table %>%
  mutate(threshold_OE =  padj < 0.05 & abs(log2FoldChange) >= 1)
# you can view the modified table
view(res_table)
# make volcano plot, the significant genes will be labeled in red
ggplot(res_table) +
  geom_point(aes(x = log2FoldChange, y = -log10(padj), colour = threshold_OE)) +
  scale_color_manual(values=c("black", "red")) +  # black v.s. red dots
  ggtitle("SNF2 against WT") +                       # this line defines the title of the plot
  xlab("log2 fold change") +                      # this line defines the name of the x-axis
  ylab("-log10 adjusted p-value") +               # name of y-axis
  scale_x_continuous(limits = c(-7.5,7.5)) +      # the axis range is set to be from -7.5 to 7.5
  theme(legend.position = "none", #c(0.9, 0.9),
        plot.title = element_text(size = rel(1.5), hjust = 0.5),
        axis.title = element_text(size = rel(1.25)))




# functional analysis using clusterprofiler
# load library
library(org.Sc.sgd.db)
library(clusterProfiler)

## Run GO enrichment analysis for the top 500 genes
significant_results_sorted <- res[order(res$padj), ]
significant_genes_500 <- rownames(significant_results_sorted[1:500, ])
ego <- enrichGO(gene = significant_genes_500,
                         keyType = "ENSEMBL",
                         OrgDb = org.Sc.sgd.db)

## Output results from GO analysis to a table
cluster_summary <- data.frame(ego)
## Dotplot
dotplot(ego, showCategory=50)
## Enrichmap clusters the 50 most significant (by padj) GO terms to visualize relationships between terms
emapplot(ego, showCategory = 50)

Go back to workshop schedule