Module 4
Lab 4A
Normalization and Preprocessing
Install and load DESeq2
Create a DESeq2 dataset object
Run the DESeq pipeline
Raw and normalized counts
# Normalized
norm_counts <- counts(dds, normalized=TRUE)
head(norm_counts)
# Raw
norm_counts <- counts(dds, normalized=FALSE)
head(norm_counts)How to use VST in DESeq2
Create a new transformed dataset
vsd <- vst(dds, blind=FALSE) # takes into account the experimental design
# Us the transformed expression matrix
assay(vsd)[1:5, 1:5]Sample Clustering (Dendrogram/Heatmap)
# install pheatmap
BiocManager::install("pheatmap", force = T)
# Load pheatmap
library(pheatmap)
# Calculate sample-to-sample distances
sampleDists <- dist(t(assay(vsd)))
# Convert distances into a matrix
sampleDistMatrix <- as.matrix(sampleDists)
# Heatmap of distances between samples
pheatmap(sampleDistMatrix,
annotation_col = as.data.frame(colData(vsd)[, "dex", drop=FALSE]),
main = "Sample-to-sample distances")
# PCA with DESeq2 VST Data colored by treatment (dex)
plotPCA(vsd, intgroup="dex")Lab 4A Tasks
Pretasks
# Load DESeq2
library(DESeq2)
library(airway)
data("airway")
# Create a DESeq2 dataset object
dds <- DESeqDataSet(airway, design = ~ dex)
# Run the DESeq pipeline
dds <- DESeq(dds)
# creates a new transformed dataset
vsd <- vst(dds, blind=FALSE) # takes into account the experimental designTask 1: Variance check (top variable genes)
# Calculate variance for each gene
geneVars <- rowVars(assay(vsd))
# Top 10 most variable genes
top10 <- head(order(geneVars, decreasing=TRUE), 10)
rownames(vsd)[top10]Task 2: Custom PCA on top 500 genes
library(ggplot2)
# Select top 500 variable genes
top500 <- head(order(geneVars, decreasing=TRUE), 500)
mat <- assay(vsd)[top500, ]
# Run PCA
pca <- prcomp(t(mat), scale. = TRUE)
# Create data frame for plotting
pca_df <- as.data.frame(pca$x)
pca_df$dex <- colData(vsd)$dex
# PCA plot
ggplot(pca_df, aes(x=PC1, y=PC2, color=dex)) +
geom_point(size=3) +
labs(title="PCA on Top 500 Variable Genes")Task 3: Challenge: Add cell line
Lab 4B
Bioconductor Packages and Data sets
Install airway package
DGE Analysis
Extract DGE results
Filter significant genes
Visualization of DGE Results
# Load required packages
library(DESeq2)
library(ggplot2)
# Run DESeq2 analysis
dds <- DESeqDataSet(airway, design = ~ dex)
dds <- DESeq(dds)
res <- results(dds)
# Convert to data frame for plotting
res_df <- as.data.frame(res)
# Remove rows with missing p-values or fold change (optional but helps avoid warnings)
res_df <- na.omit(res_df)
# Create a new column indicating regulation direction
res_df$Regulation <- "Not significant"
res_df$Regulation[res_df$log2FoldChange > 1 & res_df$padj < 0.05] <- "Upregulated"
res_df$Regulation[res_df$log2FoldChange < -1 & res_df$padj < 0.05] <- "Downregulated"Volcano plot
ggplot(res_df, aes(x = log2FoldChange, y = -log10(padj), color = Regulation)) +
geom_point(alpha = 0.6, size = 1.5) +
geom_vline(xintercept = c(-1, 1), linetype = "dashed", color = "gray40") +
geom_hline(yintercept = -log10(0.05), linetype = "dashed", color = "gray40") +
scale_color_manual(values = c("Upregulated" = "red",
"Downregulated" = "blue",
"Not significant" = "gray70")) +
labs(title = "Volcano Plot: Treated vs Untreated",
x = "log2 Fold Change",
y = "-log10(Adjusted p-value)",
color = "Regulation") +
theme_minimal()Mini Project
Task 1: Load Required Libraries and Dataset
Load packages
library(DESeq2)
library(airway)
library(ggplot2)
library(org.Hs.eg.db)
library(AnnotationDbi)
library(pheatmap)
data("airway"). # Load data
# Inspect dataset
airwayTask 2: Preprocessing and Normalization
# Create DESeq2 object
dds <- DESeqDataSet(airway, design = ~ dex)
# Run the DESeq2 pipeline (includes normalization)
dds <- DESeq(dds)
# Variance Stabilizing Transformation
vsd <- vst(dds, blind = FALSE)Task 3: Differential Expression Analysis
# Extract DE results
res <- results(dds)
summary(res)
# Filter significant genes (adjusted p < 0.05)
sig_res <- res[which(res$padj < 0.05), ]
# Order by log2 fold change
sig_res <- sig_res[order(sig_res$log2FoldChange, decreasing = TRUE), ]
# Show top results
head(sig_res)Task 4: Annotate Significant Genes
# Take top 20 genes
top_genes <- rownames(sig_res)[1:20]
# Map to gene symbols and names
symbols <- mapIds(org.Hs.eg.db,
keys = top_genes,
keytype = "ENSEMBL",
column = "SYMBOL")
names <- mapIds(org.Hs.eg.db,
keys = top_genes,
keytype = "ENSEMBL",
column = "GENENAME")
annotated <- data.frame(ENSEMBL = top_genes,
Symbol = symbols,
Name = names)
head(annotated)Task 5: Visualization
A: PCA
B: Volcano plot
res_df <- as.data.frame(res)
res_df <- na.omit(res_df)
res_df$Regulation <- "Not significant"
res_df$Regulation[res_df$log2FoldChange > 1 & res_df$padj < 0.05] <- "Upregulated"
res_df$Regulation[res_df$log2FoldChange < -1 & res_df$padj < 0.05] <- "Downregulated"
ggplot(res_df, aes(x = log2FoldChange, y = -log10(padj), color = Regulation)) +
geom_point(alpha = 0.6, size = 1.5) +
geom_vline(xintercept = c(-1, 1), linetype = "dashed", color = "gray40") +
geom_hline(yintercept = -log10(0.05), linetype = "dashed", color = "gray40") +
scale_color_manual(values = c("Upregulated" = "red",
"Downregulated" = "blue",
"Not significant" = "gray70")) +
labs(title = "Volcano Plot: Treated vs Untreated",
x = "log2 Fold Change", y = "-log10(Adjusted p-value)") +
theme_minimal()C: Heatmap of Top Variable Genes
topVarGenes <- head(order(rowVars(assay(vsd)), decreasing = TRUE), 30)
pheatmap(assay(vsd)[topVarGenes, ],
scale = "row",
annotation_col = as.data.frame(colData(vsd)[, "dex", drop=FALSE]),
main = "Top 30 Variable Genes")D: Optional Challenge (Barplot of top 10 upregulated genes)