#First we need some tools:
#Install htmltools
install.packages("htmltools")
library(htmltools)
#Install BiocManager
if (!requireNamespace("BiocManager", quietly = TRUE)) install.packages("BiocManager") BiocManager::install(version = "3.10")
#Use BiocManager to install DESeq2
BiocManager::install("DESeq2")
#Call out DESeq2 and ggplot2
library("DESeq2")
library(ggplot2)
#Save the URL of the counts table from github as ReadCountsURL
ReadCountsURL <- "https://raw.githubusercontent.com/BTomlinson/Bioinformatics_tutorial/master/AB_UTvT.csv"
#Download the ReadCountsURL file and save it as AB_UTvT.csv
download.file(ReadCountsURL, destfile = "AB_UTvT.csv", method = "auto")
#Read the downloaded file (AB_UTvT.csv) and save it as ReadCountsTable
ReadCountsTable <- read.csv('AB_UTvT.csv', header = TRUE, sep = ",")
#Lets look at the top of the table we just made
head(ReadCountsTable)
#now we need to tell R what sample belongs to which experimental group. In this case, we have a control group (untreated, UT) and a group exposed to antibiotic (treated, T). This information is defined by the metaData table
#Save the URL of the metadata table from github as metaDataURL
metaDataURL <- "https://raw.githubusercontent.com/BTomlinson/Bioinformatics_tutorial/master/AB_UTvT_metadata.csv"
#Download the ReadCountsURL file and save it as AB_UTvT_metadata.csv
download.file(metaDataURL, destfile = "AB_UTvT_metadata.csv", method = "auto")
#Read the downloaded file (AB_UTvT_metadata.csv) and save it as metaDataTable
metaDataTable <- read.csv('AB_UTvT_metadata.csv', header = TRUE, sep = ",")
#Let's take a look at our table
metaDataTable
#The main thing to note about this file is each sample is matched to the appropriate experimental condition (untreated or treated)
#You are also able to define different experimental conditions or even GEO_ID tags in one column should you wish - for now we will simply focus on untreated vs. treated
#Now we need to make the appropriate DESeqDataSet object. Heres how we can construct that from what we just did above:
DataSet <- DESeqDataSetFromMatrix(countData=ReadCountsTable, colData=metaDataTable, design=~condition, tidy = TRUE)
#Design specifies how the counts from each gene depend on our variables in the metadata.
#For this dataset, the factor we care about is our treatment status (condition)
#tidy=TRUE tells DESeq2 to output the results table with rownames as a first #column called 'row.
#OKAY, lets check out the object we just made
DataSet
#This is a good way to make sure your object is constructed correctly.
#As you can see, we made a DESeqDataSet.
#dim: 3981 6 tells us there are 3981 rows and 6 columns
#rownames shows us our rows represent each gene name
#colnames shows us each column represents each experimental sample
#If you would like some light reading on DESeq2 and all of the arguments...
?DESeq
#Excellent speed reading. Now that you are DESeq2 expert, we are ready to run the DESeq function on our DataSet!
DEDataSet <- DESeq(DataSet)
#estimateSizeFactors calculates the relative library depth of each sample
#estimateDispersions estimates the dispersion of counts for each gene
#nbinomWaldTest calculates the significance of coefficients in a Negative Binomial GLM using the size and dispersion outputs
#lets save these results as DESeqResults
DESeqResults <- results(DEDataSet)
#and take a quick peek at them
head(results(DEDataSet, tidy=TRUE))
#You can also get a quick summary of the results table, this is useful for identifying outliers too! But we have a much prettier way of doing this coming up...
summary(DESeqResults)
#Lets re-order our resuls to the most interesting, er I mean significant results are a the top. This sorts the results by adjusted p-value
DESeqResults <- DESeqResults[order(DESeqResults$padj),]
head(DESeqResults)
#we can use the plotCounts function to compare the normalized counts between untreated and treated conditions for a few genes of interest
par(mfrow=c(2,3))
plotCounts(DEDataSet, gene="adeT", intgroup="condition")
plotCounts(DEDataSet, gene="ABUW_0123", intgroup="condition")
plotCounts(DEDataSet, gene="ABUW_0302", intgroup="condition")
plotCounts(DEDataSet, gene="ABUW_0255", intgroup="condition")
plotCounts(DEDataSet, gene="ABUW_2391", intgroup="condition")
#To further decipher the data, I recommend blasting the genes with significantly altered expression to determine what their function is! Or, look into additional tools like PANTHER (http://www.pantherdb.org) or KEGG (https://www.kegg.jp)!
#reset par
par(mfrow=c(1,1))
#How about we make a basic volcano plot to view the differential expression spread?
with(DESeqResults, plot(log2FoldChange, -log10(pvalue), pch=20, main="Volcano plot", xlim=c(-3,3)))
#We can color these points to see where our significant results lie #blue if padj<0.01, red if log2FC>1 and padj<0.05)
with(subset(DESeqResults, padj<.01 ), points(log2FoldChange, -log10(pvalue), pch=20, col="blue"))
with(subset(DESeqResults, padj<.01 & abs(log2FoldChange)>2), points(log2FoldChange, -log10(pvalue), pch=20, col="red"))
#You can do a Principal Component Analysis to see how your samples group by condition #This brings out strong patterns from large and complex datasets and samples with similar expression profiles will cluster together #For a great, simplified explanation on PCA - https://blog.bioturing.com/2018/06/14/principal-component-analysis-explained-simply/ #First we transform the raw count data, use the vst function to perform variance stabilizing transformation
VSData <- vst(DESeqResults, blind=FALSE)
#Then, use the DESEQ2 plotPCA function to plot Principal Component Analysis to see how these samples group by condition
plotPCA(VSData, intgroup="condition")