This is an R Markdown Notebook. When you execute code within the notebook, the results appear beneath the code.
Try executing this chunk by clicking the Run button within the chunk or by placing your cursor inside it and pressing Ctrl+Shift+Enter.
library(dada2); packageVersion("dada2")
Loading required package: Rcpp
[1] ‘1.28.0’
path <- "Data" # CHANGE ME to the directory containing the fastq files after unzipping.
list.files(path)
[1] "ena-file-download-read_run-PRJNA489752-fastq_ftp-20251004-1440.sh"
[2] "SRR7820324_1.fastq.gz"
[3] "SRR7820324_2.fastq.gz"
[4] "SRR7820325_2.fastq.gz"
[5] "SRR7820326_1.fastq.gz"
[6] "SRR7820326_2.fastq.gz"
[7] "SRR7820327_1.fastq.gz"
[8] "SRR7820327_2.fastq.gz"
[9] "SRR7820328_1.fastq.gz"
[10] "SRR7820328_2.fastq.gz"
[11] "SRR7820329_2.fastq.gz"
[12] "SRR7820330_1.fastq.gz"
[13] "SRR7820331_1.fastq.gz"
[14] "SRR7820331_2.fastq.gz"
[15] "SRR7820332_1.fastq.gz"
[16] "SRR7820332_2.fastq.gz"
[17] "SRR7820333_1.fastq.gz"
[18] "SRR7820333_2.fastq.gz"
[19] "SRR7820334_1.fastq.gz"
[20] "SRR7820335_1.fastq.gz"
[21] "SRR7820335_2.fastq.gz"
[22] "SRR7820336_1.fastq.gz"
[23] "SRR7820337_1.fastq.gz"
[24] "SRR7820337_2.fastq.gz"
[25] "SRR7820338_1.fastq.gz"
[26] "SRR7820338_2.fastq.gz"
[27] "SRR7820339_1.fastq.gz"
[28] "SRR7820339_2.fastq.gz"
[29] "SRR7820340_2.fastq.gz"
[30] "SRR7820341_1.fastq.gz"
[31] "SRR7820341_2.fastq.gz"
[32] "SRR7820342_1.fastq.gz"
[33] "SRR7820342_2.fastq.gz"
[34] "SRR7820343_1.fastq.gz"
[35] "SRR7820343_2.fastq.gz"
[36] "SRR7820344_2.fastq.gz"
[37] "SRR7820345_1.fastq.gz"
[38] "SRR7820345_2.fastq.gz"
[39] "SRR7820346_1.fastq.gz"
[40] "SRR7820346_2.fastq.gz"
[41] "SRR7820347_1.fastq.gz"
[42] "SRR7820347_2.fastq.gz"
[43] "SRR7820348_1.fastq.gz"
[44] "SRR7820348_2.fastq.gz"
[45] "SRR7820349_1.fastq.gz"
[46] "SRR7820349_2.fastq.gz"
[47] "SRR7820350_1.fastq.gz"
[48] "SRR7820350_2.fastq.gz"
[49] "SRR7820351_1.fastq.gz"
[50] "SRR7820351_2.fastq.gz"
[51] "SRR7820352_1.fastq.gz"
[52] "SRR7820352_2.fastq.gz"
[53] "SRR7820353_1.fastq.gz"
[54] "SRR7820354_1.fastq.gz"
[55] "SRR7820354_2.fastq.gz"
[56] "SRR7820355_1.fastq.gz"
[57] "SRR7820356_1.fastq.gz"
[58] "SRR7820356_2.fastq.gz"
[59] "SRR7820357_1.fastq.gz"
[60] "SRR7820357_2.fastq.gz"
[61] "SRR7820358_1.fastq.gz"
[62] "SRR7820358_2.fastq.gz"
[63] "SRR7820359_1.fastq.gz"
[64] "SRR7820360_1.fastq.gz"
[65] "SRR7820360_2.fastq.gz"
[66] "SRR7820361_1.fastq.gz"
[67] "SRR7820361_2.fastq.gz"
[68] "SRR7820362_1.fastq.gz"
[69] "SRR7820362_2.fastq.gz"
[70] "SRR7820363_2.fastq.gz"
[71] "SRR7820364_1.fastq.gz"
[72] "SRR7820364_2.fastq.gz"
[73] "SRR7820365_2.fastq.gz"
[74] "SRR7820366_1.fastq.gz"
[75] "SRR7820366_2.fastq.gz"
[76] "SRR7820367_1.fastq.gz"
[77] "SRR7820367_2.fastq.gz"
[78] "SRR7820368_1.fastq.gz"
[79] "SRR7820368_2.fastq.gz"
[80] "SRR7820369_2.fastq.gz"
[81] "SRR7820370_1.fastq.gz"
[82] "SRR7820370_2.fastq.gz"
[83] "SRR7820371_1.fastq.gz"
[84] "SRR7820371_2.fastq.gz"
[85] "SRR7820372_1.fastq.gz"
[86] "SRR7820373_1.fastq.gz"
[87] "SRR7820373_2.fastq.gz"
# Forward and reverse fastq filenames have format: SAMPLENAME_R1_001.fastq and SAMPLENAME_R2_001.fastq
fnFs <- sort(list.files(path, pattern="_1.fastq", full.names = TRUE))
fnRs <- sort(list.files(path, pattern="_2.fastq", full.names = TRUE))
# Extract sample names, assuming filenames have format: SAMPLENAME_XXX.fastq
sample.names <- sapply(strsplit(basename(fnFs), "_"), `[`, 1)
plotQualityProfile(fnFs[1:2])
plotQualityProfile(fnRs[1:2])
# Place filtered files in filtered/ subdirectory
filtFs <- file.path(path, "filtered", paste0(sample.names, "_F_filt.fastq.gz"))
filtRs <- file.path(path, "filtered", paste0(sample.names, "_R_filt.fastq.gz"))
names(filtFs) <- sample.names
names(filtRs) <- sample.names
out <- filterAndTrim(fnFs, filtFs, fnRs, filtRs, truncLen=c(240,160),
maxN=0, maxEE=c(2,2), truncQ=2, rm.phix=TRUE,
compress=TRUE, multithread=TRUE) # On Windows set multithread=FALSE
Creating output directory: Data/filtered
head(out)
reads.in reads.out
SRR7820324_1.fastq 161427 138580
SRR7820325_1.fastq 144452 122765
SRR7820326_1.fastq 167665 145579
SRR7820327_1.fastq 171871 146349
SRR7820328_1.fastq 167836 145566
SRR7820329_1.fastq 174471 152123
errF <- learnErrors(filtFs, multithread=TRUE)
132785520 total bases in 553273 reads from 4 samples will be used for learning the error rates.
errR <- learnErrors(filtRs, multithread=TRUE)
111814240 total bases in 698839 reads from 5 samples will be used for learning the error rates.
plotErrors(errF, nominalQ=TRUE)
dadaFs <- dada(filtFs, err=errF, multithread=TRUE)
Sample 1 - 138580 reads in 32734 unique sequences.
Sample 2 - 122765 reads in 40661 unique sequences.
Sample 3 - 145579 reads in 45402 unique sequences.
Sample 4 - 146349 reads in 30802 unique sequences.
Sample 5 - 145566 reads in 46280 unique sequences.
Sample 6 - 152123 reads in 48910 unique sequences.
Sample 7 - 128984 reads in 39512 unique sequences.
Sample 8 - 128799 reads in 38728 unique sequences.
Sample 9 - 165814 reads in 47366 unique sequences.
Sample 10 - 157413 reads in 51310 unique sequences.
Sample 11 - 136103 reads in 49931 unique sequences.
Sample 12 - 164377 reads in 47139 unique sequences.
Sample 13 - 132612 reads in 35995 unique sequences.
Sample 14 - 152455 reads in 54117 unique sequences.
Sample 15 - 187524 reads in 47416 unique sequences.
Sample 16 - 163962 reads in 47483 unique sequences.
Sample 17 - 188748 reads in 35247 unique sequences.
Sample 18 - 142973 reads in 33314 unique sequences.
Sample 19 - 149837 reads in 44288 unique sequences.
Sample 20 - 152948 reads in 25533 unique sequences.
Sample 21 - 158102 reads in 56099 unique sequences.
Sample 22 - 211695 reads in 44311 unique sequences.
Sample 23 - 216274 reads in 42730 unique sequences.
Sample 24 - 199047 reads in 39050 unique sequences.
Sample 25 - 159729 reads in 38885 unique sequences.
Sample 26 - 171072 reads in 55478 unique sequences.
Sample 27 - 151917 reads in 34206 unique sequences.
Sample 28 - 226287 reads in 46514 unique sequences.
Sample 29 - 190763 reads in 44623 unique sequences.
Sample 30 - 153286 reads in 46742 unique sequences.
Sample 31 - 207065 reads in 39561 unique sequences.
Sample 32 - 167336 reads in 40105 unique sequences.
Sample 33 - 152003 reads in 57490 unique sequences.
Sample 34 - 185893 reads in 37068 unique sequences.
Sample 35 - 140685 reads in 22992 unique sequences.
Sample 36 - 131721 reads in 42234 unique sequences.
Sample 37 - 166740 reads in 60565 unique sequences.
Sample 38 - 164620 reads in 44922 unique sequences.
Sample 39 - 168504 reads in 31582 unique sequences.
Sample 40 - 185000 reads in 30981 unique sequences.
Sample 41 - 166184 reads in 26471 unique sequences.
Sample 42 - 154384 reads in 26755 unique sequences.
Sample 43 - 171138 reads in 28590 unique sequences.
Sample 44 - 158249 reads in 36161 unique sequences.
Sample 45 - 164411 reads in 33948 unique sequences.
Sample 46 - 169899 reads in 35073 unique sequences.
Sample 47 - 131614 reads in 54519 unique sequences.
Sample 48 - 151162 reads in 56851 unique sequences.
Sample 49 - 134897 reads in 43440 unique sequences.
Sample 50 - 198648 reads in 39044 unique sequences.
dadaRs <- dada(filtRs, err=errR, multithread=TRUE)
Sample 1 - 138580 reads in 29259 unique sequences.
Sample 2 - 122765 reads in 32654 unique sequences.
Sample 3 - 145579 reads in 38297 unique sequences.
Sample 4 - 146349 reads in 27236 unique sequences.
Sample 5 - 145566 reads in 36352 unique sequences.
Sample 6 - 152123 reads in 39132 unique sequences.
Sample 7 - 128984 reads in 30237 unique sequences.
Sample 8 - 128799 reads in 26762 unique sequences.
Sample 9 - 165814 reads in 39691 unique sequences.
Sample 10 - 157413 reads in 36194 unique sequences.
Sample 11 - 136103 reads in 33579 unique sequences.
Sample 12 - 164377 reads in 38075 unique sequences.
Sample 13 - 132612 reads in 32235 unique sequences.
Sample 14 - 152455 reads in 38326 unique sequences.
Sample 15 - 187524 reads in 38777 unique sequences.
Sample 16 - 163962 reads in 36975 unique sequences.
Sample 17 - 188748 reads in 33752 unique sequences.
Sample 18 - 142973 reads in 24933 unique sequences.
Sample 19 - 149837 reads in 31845 unique sequences.
Sample 20 - 152948 reads in 25454 unique sequences.
Sample 21 - 158102 reads in 44562 unique sequences.
Sample 22 - 211695 reads in 43823 unique sequences.
Sample 23 - 216274 reads in 44791 unique sequences.
Sample 24 - 199047 reads in 36700 unique sequences.
Sample 25 - 159729 reads in 33759 unique sequences.
Sample 26 - 171072 reads in 48312 unique sequences.
Sample 27 - 151917 reads in 34048 unique sequences.
Sample 28 - 226287 reads in 44591 unique sequences.
Sample 29 - 190763 reads in 33149 unique sequences.
Sample 30 - 153286 reads in 41856 unique sequences.
Sample 31 - 207065 reads in 37385 unique sequences.
Sample 32 - 167336 reads in 39057 unique sequences.
Sample 33 - 152003 reads in 49197 unique sequences.
Sample 34 - 185893 reads in 36384 unique sequences.
Sample 35 - 140685 reads in 24553 unique sequences.
Sample 36 - 131721 reads in 37521 unique sequences.
Sample 37 - 166740 reads in 49885 unique sequences.
Sample 38 - 164620 reads in 44028 unique sequences.
Sample 39 - 168504 reads in 30831 unique sequences.
Sample 40 - 185000 reads in 32577 unique sequences.
Sample 41 - 166184 reads in 29594 unique sequences.
Sample 42 - 154384 reads in 28971 unique sequences.
Sample 43 - 171138 reads in 27063 unique sequences.
Sample 44 - 158249 reads in 34661 unique sequences.
Sample 45 - 164411 reads in 33113 unique sequences.
Sample 46 - 169899 reads in 30442 unique sequences.
Sample 47 - 131614 reads in 39801 unique sequences.
Sample 48 - 151162 reads in 47824 unique sequences.
Sample 49 - 134897 reads in 39466 unique sequences.
Sample 50 - 198648 reads in 40564 unique sequences.
dadaFs[[1]]
dada-class: object describing DADA2 denoising results
557 sequence variants were inferred from 32734 input unique sequences.
Key parameters: OMEGA_A = 1e-40, OMEGA_C = 1e-40, BAND_SIZE = 16
mergers <- mergePairs(dadaFs, filtFs, dadaRs, filtRs, verbose=TRUE)
14 paired-reads (in 8 unique pairings) successfully merged out of 136134 (in 20538 pairings) input.
0 paired-reads (in 0 unique pairings) successfully merged out of 118151 (in 25022 pairings) input.
20 paired-reads (in 10 unique pairings) successfully merged out of 140624 (in 32048 pairings) input.
15 paired-reads (in 6 unique pairings) successfully merged out of 144191 (in 13768 pairings) input.
9 paired-reads (in 5 unique pairings) successfully merged out of 140632 (in 29392 pairings) input.
15 paired-reads (in 11 unique pairings) successfully merged out of 147259 (in 35257 pairings) input.
5 paired-reads (in 5 unique pairings) successfully merged out of 124486 (in 27490 pairings) input.
16 paired-reads (in 11 unique pairings) successfully merged out of 125030 (in 22837 pairings) input.
12 paired-reads (in 7 unique pairings) successfully merged out of 161066 (in 31557 pairings) input.
38 paired-reads (in 27 unique pairings) successfully merged out of 152297 (in 33945 pairings) input.
2 paired-reads (in 1 unique pairings) successfully merged out of 130933 (in 30259 pairings) input.
7 paired-reads (in 5 unique pairings) successfully merged out of 159925 (in 26783 pairings) input.
0 paired-reads (in 0 unique pairings) successfully merged out of 129569 (in 29155 pairings) input.
0 paired-reads (in 0 unique pairings) successfully merged out of 147312 (in 35836 pairings) input.
6 paired-reads (in 6 unique pairings) successfully merged out of 183994 (in 27446 pairings) input.
20 paired-reads (in 5 unique pairings) successfully merged out of 159691 (in 28380 pairings) input.
38 paired-reads (in 3 unique pairings) successfully merged out of 185949 (in 17236 pairings) input.
5 paired-reads (in 1 unique pairings) successfully merged out of 140724 (in 15244 pairings) input.
7 paired-reads (in 6 unique pairings) successfully merged out of 146383 (in 24237 pairings) input.
83 paired-reads (in 3 unique pairings) successfully merged out of 151964 (in 8390 pairings) input.
5 paired-reads (in 5 unique pairings) successfully merged out of 151824 (in 41544 pairings) input.
11293 paired-reads (in 355 unique pairings) successfully merged out of 208339 (in 12797 pairings) input.
8074 paired-reads (in 214 unique pairings) successfully merged out of 212969 (in 11726 pairings) input.
8090 paired-reads (in 179 unique pairings) successfully merged out of 196554 (in 8989 pairings) input.
9210 paired-reads (in 214 unique pairings) successfully merged out of 157543 (in 7227 pairings) input.
4 paired-reads (in 4 unique pairings) successfully merged out of 166369 (in 46777 pairings) input.
13697 paired-reads (in 279 unique pairings) successfully merged out of 149856 (in 6123 pairings) input.
9543 paired-reads (in 212 unique pairings) successfully merged out of 223330 (in 13643 pairings) input.
7020 paired-reads (in 192 unique pairings) successfully merged out of 189431 (in 8346 pairings) input.
2 paired-reads (in 2 unique pairings) successfully merged out of 149045 (in 36460 pairings) input.
9944 paired-reads (in 228 unique pairings) successfully merged out of 205149 (in 9459 pairings) input.
13535 paired-reads (in 334 unique pairings) successfully merged out of 164052 (in 7995 pairings) input.
0 paired-reads (in 0 unique pairings) successfully merged out of 146553 (in 43547 pairings) input.
551 paired-reads (in 19 unique pairings) successfully merged out of 182826 (in 16240 pairings) input.
64 paired-reads (in 11 unique pairings) successfully merged out of 139046 (in 7360 pairings) input.
0 paired-reads (in 0 unique pairings) successfully merged out of 128255 (in 33462 pairings) input.
0 paired-reads (in 0 unique pairings) successfully merged out of 161352 (in 45379 pairings) input.
8 paired-reads (in 5 unique pairings) successfully merged out of 159973 (in 42076 pairings) input.
563 paired-reads (in 16 unique pairings) successfully merged out of 166224 (in 8395 pairings) input.
93 paired-reads (in 14 unique pairings) successfully merged out of 183155 (in 17611 pairings) input.
533 paired-reads (in 17 unique pairings) successfully merged out of 164562 (in 6247 pairings) input.
279 paired-reads (in 8 unique pairings) successfully merged out of 152641 (in 6601 pairings) input.
7 paired-reads (in 3 unique pairings) successfully merged out of 169500 (in 16524 pairings) input.
404 paired-reads (in 14 unique pairings) successfully merged out of 154634 (in 17286 pairings) input.
274 paired-reads (in 7 unique pairings) successfully merged out of 161596 (in 13664 pairings) input.
121 paired-reads (in 13 unique pairings) successfully merged out of 167096 (in 16695 pairings) input.
10 paired-reads (in 9 unique pairings) successfully merged out of 125312 (in 38071 pairings) input.
2 paired-reads (in 2 unique pairings) successfully merged out of 145042 (in 45364 pairings) input.
0 paired-reads (in 0 unique pairings) successfully merged out of 130446 (in 41367 pairings) input.
8251 paired-reads (in 176 unique pairings) successfully merged out of 196439 (in 7846 pairings) input.
# Inspect the merger data.frame from the first sample
head(mergers[[1]])
seqtab <- makeSequenceTable(mergers)
dim(seqtab)
[1] 50 1215
# Inspect distribution of sequence lengths
table(nchar(getSequences(seqtab)))
240 248 249 252 253 255 256 258 259 260 262 263 264 265 266 268 269 272 273 274 275 276 277 279 283 284
1 1 3 1 2 6 1 1 1 1 3 2 1 6 2 1 1 1 1 11 12 1 1 1 2 4
285 286 287 289 290 291 293 294 295 296 297 298 302 303 304 305 306 311 312 313 315 316 317 319 320 321
17 63 5 108 18 12 3 5 7 10 9 14 2 2 1 7 50 2 3 1 1 2 5 1 3 1
322 323 324 325 326 328 329 333 334 336 337 339 340 341 342 343 345 346 347 348 350 351 352 353 354 355
1 2 3 4 2 6 8 2 2 4 3 1 3 2 4 1 3 9 4 1 5 1 4 1 5 5
356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381
7 2 8 32 3 8 10 7 4 7 17 4 29 6 12 4 6 40 6 105 22 9 33 20 40 7
382 383 384 385 386 387 388
99 29 22 33 20 47 11
seqtab.nochim <- removeBimeraDenovo(seqtab, method="consensus", multithread=TRUE, verbose=TRUE)
Identified 521 bimeras out of 1215 input sequences.
dim(seqtab.nochim)
[1] 50 694
sum(seqtab.nochim)/sum(seqtab)
[1] 0.6708379
getN <- function(x) sum(getUniques(x))
track <- cbind(out, sapply(dadaFs, getN), sapply(dadaRs, getN), sapply(mergers, getN), rowSums(seqtab.nochim))
# If processing a single sample, remove the sapply calls: e.g. replace sapply(dadaFs, getN) with getN(dadaFs)
colnames(track) <- c("input", "filtered", "denoisedF", "denoisedR", "merged", "nonchim")
rownames(track) <- sample.names
head(track)
input filtered denoisedF denoisedR merged nonchim
SRR7820324 161427 138580 136888 137779 14 14
SRR7820325 144452 122765 119828 120995 0 0
SRR7820326 167665 145579 142329 143788 20 19
SRR7820327 171871 146349 144955 145548 15 14
SRR7820328 167836 145566 142073 144041 9 7
SRR7820329 174471 152123 148933 150382 15 11
taxa <- assignTaxonomy(seqtab.nochim, "silva_nr_v132_train_set.fa.gz", multithread=TRUE)
taxa.print <- taxa # Removing sequence rownames for display only
rownames(taxa.print) <- NULL
head(taxa.print)
Kingdom Phylum Class Order Family Genus
[1,] "Eukaryota" "Euglenozoa" NA NA NA NA
[2,] "Eukaryota" "Euglenozoa" NA NA NA NA
[3,] "Eukaryota" "Euglenozoa" NA NA NA NA
[4,] "Eukaryota" "Euglenozoa" NA NA NA NA
[5,] "Eukaryota" "Euglenozoa" NA NA NA NA
[6,] "Eukaryota" "Euglenozoa" NA NA NA NA