## ----setup, include=FALSE----------------------------------------------------- knitr::opts_chunk$set(echo = TRUE) ## ----eval=FALSE--------------------------------------------------------------- # # install.packages("devtools") # devtools::install_github("cristianetaniguti/Qploidy") ## ----------------------------------------------------------------------------- library(Qploidy) # Simulate a VCF file with 500 markers and 60 samples simulate_vcf( seed = 1234, file_path = "vcf_example_simulated.vcf", n_tetraploid = 35, n_diploid = 5, n_triploid = 10, n_markers = 500 ) ## ----------------------------------------------------------------------------- data <- qploidy_read_vcf(vcf_file = "vcf_example_simulated.vcf") head(data) ## ----echo=FALSE--------------------------------------------------------------- temp1 <- tempfile(fileext = ".txt") simulate_axiom_summary(seed = 1234, file_path = temp1) temp1 <- read.table(temp1, header = TRUE) head(temp1) ## ----eval=FALSE--------------------------------------------------------------- # data <- read_axiom("AxiomGT1_summary.txt") # head(data) ## ----eval=FALSE--------------------------------------------------------------- # data <- read_illumina_array( # "data/illumina/Set1.txt", # This is a example code, data not available # "data/illumina/Set2.txt", # "data/illumina/Set3.txt" # ) # head(input_data) ## ----------------------------------------------------------------------------- # All samples all_samples <- unique(data$SampleName) all_samples tetraploid_samples <- all_samples[grep("Tetraploid", all_samples)] tetraploid_samples ## ----eval=FALSE--------------------------------------------------------------- # data_reference <- data ## ----------------------------------------------------------------------------- genos <- qploidy_read_vcf("vcf_example_simulated.vcf", geno = TRUE) dim(genos) genos.pos <- qploidy_read_vcf("vcf_example_simulated.vcf", geno.pos = TRUE) head(genos.pos) ## ----eval=FALSE--------------------------------------------------------------- # genos.pos$MarkerName <- paste0(genos.pos$Chromosome, "_", genos.pos$Position) # head(genos.pos) ## ----------------------------------------------------------------------------- genos <- genos[which(genos$SampleName %in% tetraploid_samples), ] ## ----------------------------------------------------------------------------- library(tidyr) # Prepare inputs for updog ## Approach (1) # tobe_genotyped <- data[which(data$SampleName %in% tetraploid_samples),] ## Approach (2) tobe_genotyped <- data ref <- pivot_wider(tobe_genotyped[, 1:3], names_from = SampleName, values_from = X) ref <- as.matrix(ref) rownames_ref <- ref[, 1] ref <- ref[, -1] ref <- apply(ref, 2, as.numeric) rownames(ref) <- rownames_ref size <- pivot_wider(tobe_genotyped[, c(1, 2, 5)], names_from = SampleName, values_from = R) size <- as.matrix(size) rownames_size <- size[, 1] size <- size[, -1] size <- apply(size, 2, as.numeric) rownames(size) <- rownames_size size[1:10, 1:10] ref[1:10, 1:10] ## ----------------------------------------------------------------------------- library(updog) multidog_obj <- multidog( refmat = ref, sizemat = size, model = "norm", # It depends of your population structure ploidy = 4, # Most common ploidy in your population nc = 6 ) # Change the parameters accordingly!! genos <- data.frame( MarkerName = multidog_obj$inddf$snp, SampleName = multidog_obj$inddf$ind, geno = multidog_obj$inddf$geno, prob = multidog_obj$inddf$maxpostprob ) head(genos) ## ----eval=FALSE--------------------------------------------------------------- # library(fitPoly) # saveMarkerModels( # ploidy = 4, # Most common ploidy among Texas Garden Roses collection # data = data_reference, # output of the previous function # filePrefix = "fitpoly_output/texas_roses", # Define the path and prefix for the result files # ncores = 2 # ) # Change it accordingly to your system!!!! # # library(vroom) # Package to speed up reading files # fitpoly_scores <- vroom("fitpoly_output/texas_roses_scores.dat") # # genos <- data.frame( # MarkerName = fitpoly_scores$MarkerName, # SampleName = fitpoly_scores$SampleName, # geno = fitpoly_scores$maxgeno, # prob = fitpoly_scores$maxP # ) # head(genos) ## ----eval=FALSE--------------------------------------------------------------- # # File containing markers genomic positions # genos.pos_file <- read.table("geno.pos_roses_texas.txt", header = T) # head(genos.pos) ## ----eval=FALSE--------------------------------------------------------------- # # Edit for Qploidy input format # genos.pos <- data.frame( # MarkerName = genos.pos_file$probes, # Chromosome = genos.pos_file$chr, # Position = genos.pos_file$pos # ) # head(genos.pos) ## ----echo=FALSE--------------------------------------------------------------- temp_file <- tempfile(fileext = ".tsv.gz") # saving in a temporary file simu_data_standardized <- standardize( data = data, genos = genos, geno.pos = genos.pos, ploidy.standardization = 4, threshold.n.clusters = 5, n.cores = 1, out_filename = temp_file, verbose = TRUE ) simu_data_standardized ## ----eval=FALSE--------------------------------------------------------------- # simu_data_standardized <- standardize( # data = data, # genos = genos, # geno.pos = genos.pos, # ploidy.standardization = 4, # threshold.n.clusters = 5, # n.cores = 1, # out_filename = "my_standardized_data.tsv.gz", # verbose = TRUE # ) # # simu_data_standardized ## ----echo=FALSE--------------------------------------------------------------- simu_data_standardized <- read_qploidy_standardization(temp_file) ## ----eval=FALSE--------------------------------------------------------------- # simu_data_standardized <- read_qploidy_standardization("my_standardized_data.tsv.gz") ## ----------------------------------------------------------------------------- # Select a sample to be evaluated samples <- unique(simu_data_standardized$data$SampleName) head(samples) ## ----------------------------------------------------------------------------- sample <- "Tetraploid1" # Proportion of heterozygous loci, BAF (Qploidy standardized ratio), and zscore by genomic positions p <- plot_qploidy_standardization( x = simu_data_standardized, sample = sample, type = c("het", "BAF", "zscore"), dot.size = 0.05, chr = 1:2 ) p ## ----------------------------------------------------------------------------- # Heterozygous frequency, BAF (Qploidy standardized ratio), and zscore by genomic positions # centromere positions added p <- plot_qploidy_standardization( x = simu_data_standardized, sample = sample, type = c("het", "BAF", "zscore"), dot.size = 0.05, chr = 1:2, add_centromeres = TRUE, centromeres = c("chr1" = 15000000, "chr2" = 19000000) ) p ## ----------------------------------------------------------------------------- # Raw allele intensity or read count ratio and BAF (Qploidy standardized ratio) histograms # combining all markers in the sample (sample level resolution) p <- plot_qploidy_standardization( x = simu_data_standardized, sample = sample, type = c("Ratio_hist_overall", "BAF_hist_overall"), chr = 1:2, ploidy = 4, add_expected_peaks = TRUE ) p ## ----------------------------------------------------------------------------- # BAF histograms (chromosome level resolution) and Z score p <- plot_qploidy_standardization( x = simu_data_standardized, sample = sample, type = c("BAF_hist", "zscore"), chr = 1:2, add_expected_peaks = TRUE, ploidy = c(4, 4) ) p ## ----------------------------------------------------------------------------- # BAF histograms combining all markers in the sample (chromosome-arm level resolution) and Z score p <- plot_qploidy_standardization( x = simu_data_standardized, sample = sample, type = c("BAF_hist", "zscore"), chr = 1:2, ploidy = c(4, 4, 4, 4), # Provide ploidy for each arm add_expected_peaks = TRUE, add_centromeres = TRUE, centromeres = c("chr1" = 15000000, "chr2" = 19000000) ) p ## ----------------------------------------------------------------------------- # If sample level resolution estimated_ploidies_sample <- area_estimate_ploidy( qploidy_standardization = simu_data_standardized, # standardization result object samples = "all", # Samples or "all" to estimate all samples level = "sample", # Resolution level ploidies = c(2, 3, 4, 5) ) # Ploidies to be tested estimated_ploidies_sample ## ----------------------------------------------------------------------------- head(estimated_ploidies_sample$ploidy) ## ----------------------------------------------------------------------------- # If chromosome resolution estimated_ploidies_chromosome <- area_estimate_ploidy( qploidy_standardization = simu_data_standardized, samples = "all", level = "chromosome", ploidies = c(2, 3, 4, 5) ) estimated_ploidies_chromosome ## ----------------------------------------------------------------------------- head(estimated_ploidies_chromosome$ploidy) ## ----------------------------------------------------------------------------- # If chromosome-arm resolution estimated_ploidies_chromosome_arm <- area_estimate_ploidy( qploidy_standardization = simu_data_standardized, samples = "all", level = "chromosome-arm", ploidies = c(2, 3, 4, 5), centromeres = c("chr1" = 15000000, "chr2" = 19000000) ) estimated_ploidies_chromosome_arm ## ----------------------------------------------------------------------------- head(estimated_ploidies_chromosome_arm$ploidy) ## ----------------------------------------------------------------------------- estimated_ploidies_format <- merge_arms_format(estimated_ploidies_chromosome_arm) estimated_ploidies_format ## ----------------------------------------------------------------------------- head(estimated_ploidies_format$ploidy) ## ----eval=FALSE--------------------------------------------------------------- # write.csv(estimated_ploidies_format$ploidy, file = "ploidies_before_visual_evaluation.csv") ## ----eval=FALSE--------------------------------------------------------------- # dir.create("figures") # # for (i in seq_along(samples)) { # print(paste0("Part:", i, "/", length(samples))) # print(paste("Generating figures for", samples[i], "...")) # # This function will generate figures for sample, chromosome and chromosome-arm resolution level evaluation # all_resolutions_plots( # data_standardized = simu_data_standardized, # sample = samples[i], # ploidy = estimated_ploidies_chromosome$ploidy[i, 1:2], # chr = 1:2, # centromeres = c("chr1" = 15000000, "chr2" = 19000000), # file_name = paste0("figures/", samples[i]) # ) # print(paste("Done!")) # } ## ----------------------------------------------------------------------------- ploidy_corrected <- estimated_ploidies_chromosome$ploidy ## ----eval=FALSE--------------------------------------------------------------- # # i) Select the highest resolution plots from the first standardization round. # tetraploids <- apply(ploidy_corrected, 1, function(x) all(x == 4)) # tetraploids # # data_reference2 <- rownames(ploidy_corrected)[which(tetraploids)] # length(data_reference2) # # # ii) Filter these samples from the `data` object. # genos_round2 <- genos[which(genos$SampleName %in% data_reference2), ] # # # iii) Run standardization again using this subset as the reference. # simu_data_standardized_round2 <- standardize( # data = data, # genos = genos_round2, # geno.pos = genos.pos, # ploidy.standardization = 4, # threshold.n.clusters = 5, # n.cores = 1, # out_filename = "my_round2_standardization.tsv.gz", # verbose = TRUE # ) # simu_data_standardized_round2