## ----eval,eval=TRUE----------------------------------------------------------- knitr::opts_chunk$set(echo=TRUE,eval=FALSE) sim.app <- FALSE # reproduce simulation and application? fig.tab <- FALSE # reproduce figures and tables? ## ----setup,eval=sim.app|fig.tab----------------------------------------------- # path <- "C:/Users/arauschenberger/Desktop/sparselink" # LIH (Windows) # #path <- "/Users/armin.rauschenberger/Desktop/LIH/sparselink" # LCSB (Mac) # # dir <- c("results","manuscript","package/R/functions.R") # for(i in seq_along(dir)){ # if(!dir.exists(file.path(path,dir[i]))&!file.exists(file.path(path,dir[i]))){ # stop(paste0("Require folder/file'",dir[i],"'.")) # } # } # source(file.path(path,"package/R/functions.R")) # Or load 'sparselink' package. # # inst <- rownames(utils::installed.packages()) # pkgs <- c("knitr","rmarkdown","glmnet","BiocManager","mvtnorm","glmtrans","spls","xrnet") # for(i in seq_along(pkgs)){ # if(!pkgs[i]%in%inst){ # utils::install.packages(pkgs[i]) # } # } # pkgs <- c("recount3","edgeR") # for(i in seq_along(pkgs)){ # if(!pkgs[i]%in%inst){ # BiocManager::install(pkgs[i]) # } # } # # blue <- "blue"; red <- "red" # # if(exists("sim.app")&exists("fig.tab")){ # if(!sim.app&fig.tab){ # files <- c("simulation_multiple.RData","simulation_transfer.RData","recount3_data.RData","explore_data.RData","application.RData") # for(i in seq_along(files)){ # if(!file.exists(file.path(path,"results",files[i]))){ # stop("File",files[i],"is missing.") # } # } # } # } ## ----method_figure,eval=fig.tab----------------------------------------------- # #<> # # grDevices::postscript(file=file.path(path,"manuscript","fig_flow.eps"),width=6,height=2.5,horizontal=FALSE,onefile=FALSE,paper="special") # graphics::par(mfrow=c(1,1),mar=c(0,0,0,0)) # graphics::plot.new() # graphics::plot.window(xlim=c(-0.2,1.0),ylim=c(0.0,1.0)) # cex <- 0.8 # # pos <- data.frame(left=0.2,right=0.8,top=0.8,centre=0.45,bottom=0.1) # mar <- data.frame(vertical=0.08,horizontal=0.08,dist=0.04) # # graphics::text(labels=paste("problem",1:2),x=c(pos$left,pos$right),y=pos$top+2*mar$vertical,font=2,col=c(blue,red),cex=cex) # graphics::text(labels=expression(hat(beta)["j,1"]^{init}),x=pos$left,y=pos$top,col=blue) # graphics::text(labels=expression(hat(beta)["j,2"]^{init}),x=pos$right,y=pos$top,col=red) # # graphics::arrows(x0=rep(c(pos$left,pos$right),each=2),x1=rep(c(pos$left,pos$right),times=2)+c(-mar$horizontal,-mar$horizontal,mar$horizontal,mar$horizontal),y0=pos$top-mar$vertical,y1=pos$centre+mar$vertical,length=0.1,col=rep(c(blue,red),each=2),lwd=2) # # graphics::text(labels=expression(w["j,1"]^{int}),x=pos$left-mar$horizontal-mar$dist,y=pos$centre,col=blue) # graphics::text(labels=expression(w["p+j,1"]^{int}),x=pos$left-mar$horizontal+mar$dist,y=pos$centre,col=blue) # graphics::text(labels=expression(w["j,1"]^{ext}),x=pos$left+mar$horizontal-mar$dist,y=pos$centre,col=red) # graphics::text(labels=expression(w["p+j,1"]^{ext}),x=pos$left+mar$horizontal+mar$dist,y=pos$centre,col=red) # # graphics::text(labels=expression(w["j,2"]^{ext}),x=pos$right-mar$horizontal-mar$dist,y=pos$centre,col=blue) # graphics::text(labels=expression(w["p+j,2"]^{ext}),x=pos$right-mar$horizontal+mar$dist,y=pos$centre,col=blue) # graphics::text(labels=expression(w["j,2"]^{int}),x=pos$right+mar$horizontal-mar$dist,y=pos$centre,col=red) # graphics::text(labels=expression(w["p+j,2"]^{int}),x=pos$right+mar$horizontal+mar$dist,y=pos$centre,col=red) # # graphics::arrows(x0=c(pos$left,pos$right),y0=pos$centre-mar$vertical,y1=pos$bottom+mar$vertical,col=c(blue,red),length=0.1,lwd=2) # graphics::text(labels=expression(hat(beta)["j,1"]^{final}==hat(gamma)["j,1"]-hat(gamma)["p+j,1"]),x=pos$left,y=pos$bottom,col=blue) # graphics::text(labels=expression(hat(beta)["j,2"]^{final}==hat(gamma)["j,2"]-hat(gamma)["p+j,2"]),x=pos$right,y=pos$bottom,col=red) # # graphics::text(x=-0.1,y=c(pos$top,pos$bottom),labels=paste("stage",1:2),font=2,cex=cex) # grDevices::dev.off() ## ----simulation,eval=sim.app-------------------------------------------------- # #<> # # repetitions <- 10 # # for(mode in c("transfer","multiple")){ # # grid <- expand.grid(prob.separate=c(0.0,0.025,0.05),prob.common=c(0.0,0.025,0.05),family="gaussian") # grid <- grid[rep(seq_len(nrow(grid)),each=repetitions),] # # grid$seed <- seq_len(nrow(grid)) # grid$family <- as.character(grid$family) # deviance <- auc <- time <- mse.coef <- mse.zero <- mse.nzero <- sel.num <- sel.coef <- sel.count <- hyperpar <- list() # for(i in seq(from=1,to=nrow(grid))){ # set.seed(seed=grid$seed[i]) # cat("i=",i,"\n") # if(mode=="transfer"){ # data <- sim_data_trans(prob.common=grid$prob.common[i],prob.separate=grid$prob.separate[i],family=grid$family[i]) # method <- c("wrap_separate","wrap_glmtrans","sparselink","wrap_xrnet") # } else if(mode=="multiple"){ # #--- multi-task learning --- # data <- sim_data_multi(prob.common=grid$prob.common[i],prob.separate=grid$prob.separate[i],family=grid$family[i]) # method <- c("wrap_separate","wrap_mgaussian","sparselink","wrap_spls") # } # # result <- traintest(y_train=data$y_train,X_train=data$X_train,y_test=data$y_test,X_test=data$X_test,family=grid$family[i],method=method) # hyperpar[[i]] <- result$hyperpar # time[[i]] <- result$time # auc[[i]] <- result$auc # deviance[[i]] <- result$deviance # sel.num[[i]] <- t(sapply(result$coef,function(x) colSums(x!=0))) # sel.count[[i]] <- t(sapply(result$coef,function(x) rowMeans(count_matrix(truth=sign(data$beta),estim=sign(x))))) # Add na.rm=TRUE? # # sel.coef[[i]] <- t(sapply(result$coef,function(x) colMeans(sign(x)!=sign(data$beta)))) # # CONTINUE HERE: consider sparsity, true positives, false negatives, signs # # mse.coef[[i]] <- t(sapply(result$coef,function(x) colMeans((data$beta-x)^2))) # mse.zero[[i]] <- t(sapply(result$coef,function(x) colMeans(((data$beta==0)*(data$beta-x))^2))) # mse.nzero[[i]] <- t(sapply(result$coef,function(x) colMeans(((data$beta!=0)*(data$beta-x))^2))) # } # save(grid,deviance,auc,sel.num,sel.count,sel.coef,mse.coef,mse.zero,mse.nzero,time,file=file.path(path,"results",paste0("simulation_",mode,".RData"))) # } # # writeLines(text=capture.output(utils::sessionInfo(),cat("\n"), sessioninfo::session_info()),con=paste0(path,"/results/info_sim.txt")) ## ----simulation_figure,eval=fig.tab------------------------------------------- # #<> # # caption <- paste(c("\\textbf{Multi-task learning.}","\\textbf{Transfer learning.}"),"Comparison of different measures (rows) between an available method (red) and the proposed method (blue) in different simulation settings (columns), based on the average of three problems",c("(tasks)","(datasets)"),"for each repetition out of ten. Measures: performance metric (mean squared error on hold-out data, as a fraction of the one from standard lasso regression; a point below the dashed line means that",c("multi-task","transfer"),"learning improves predictions), sparsity (number of non-zero coefficients), precision (number of coefficients with correct signs divided by number of non-zero coefficients). The arrows point in the direction of improvement. Settings: percentage of features with a common effect for all problems ($\\pi_\\theta$), percentage of features with a specific effect for each problem ($\\pi_\\delta$).",c("\\label{fig_sim_multiple}","\\label{fig_sim_transfer}")) # # figure_change <- function(model0,model1="sparselink",model2){ # # mode <- paste0(100*grid$prob.common,"%\n",100*grid$prob.separate,"%") # # graphics::par(mfrow=c(3,1),mar=c(3,3,1,1)) # # label <- function(){ # cex <- 0.5 # at <- 0.3 # graphics::mtext(text=expression(pi[theta]==phantom(.)),side=1,line=0.2,at=at,cex=cex) # graphics::mtext(text=expression(pi[delta]==phantom(.)),side=1,line=1.2,at=at,cex=cex) # } # # #--- predictive performance --- # means <- t(sapply(X=deviance,FUN=rowMeans)) # means <- means/means[,"wrap_separate"] # plot_change(x=mode,y0=means[,model0],y1=means[,model1],y2=means[,model2],main="metric",increase=FALSE) # graphics::abline(h=1,lty=2,col="grey") # label() # # #--- sparsity --- # nzero <- sapply(X=sel.num,FUN=rowMeans) # plot_change(x=mode,y0=nzero[model0,],y1=nzero[model1,],y2=nzero[model2,],main="sparsity",increase=FALSE) # # graphics::abline(h=0,lty=2,col="grey") # label() # # #--- precision --- # precision <- sapply(X=sel.count,FUN=function(x) x[,"precision"]) # precision[is.na(precision)] <- 0 # plot_change(x=mode,y0=precision[model0,],y1=precision[model1,],y2=precision[model2,],main="precision",increase=TRUE) # # graphics::abline(h=0,lty=2,col="grey") # label() # # } # # grDevices::postscript(file=file.path(path,"manuscript","fig_sim_multiple.eps"),width=6.5,height=6,horizontal=FALSE,onefile=FALSE,paper="special") # load(file.path(path,paste0("results/simulation_multiple.RData")),verbose=TRUE) # #model.ref <- "wrap_mgaussian" # #model.own <- "sparselink" # figure_change(model0="wrap_mgaussian",model1="sparselink",model2="wrap_spls") # rowMeans(sapply(deviance,function(x) rank(rowMeans(x)))) # rowMeans(sapply(deviance,function(x) colMeans(t(x)/x["wrap_separate",]))) # runtime <- rowSums(sapply(time,function(x) x)) # round(runtime/runtime["wrap_separate"],digits=2) # grDevices::dev.off() # # grDevices::postscript(file=file.path(path,"manuscript","fig_sim_transfer.eps"),width=6.5,height=6,horizontal=FALSE,onefile=FALSE,paper="special") # load(file.path(path,paste0("results/simulation_transfer.RData"))) # #model.ref <- "wrap_glmtrans" # #model.own <- "sparselink" # figure_change(model0="wrap_glmtrans",model1="sparselink",model2="wrap_xrnet") # rowMeans(sapply(deviance,function(x) rank(rowMeans(x)))) # rowMeans(sapply(deviance,function(x) colMeans(t(x)/x["wrap_separate",]))) # runtime <- rowSums(sapply(time,function(x) x)) # round(runtime/runtime["wrap_separate"],digits=2) # grDevices::dev.off() ## ----sim_extra,eval=sim.app--------------------------------------------------- # # Effect of sample size in source or target dataset (TL), effect of sample size (MTL). # #<> # # repetitions <- 50 # grid <- metric <- list() # # for(mode in c("MTL-size","TL-source","TL-target")){ #,"TL-prop","MTL-prop" # metric[[mode]] <- list() # cand <- c(20,40,60,80,100) # if(mode=="MTL-size"){ # grid[[mode]] <- expand.grid(prob.common=0.05,prob.separate=0.025,family="gaussian",n0=cand) # } else if(mode=="TL-source"){ # grid[[mode]] <- expand.grid(prob.common=0.05,prob.separate=0.025,family="gaussian",n_source=cand,n_target=50) # } else if(mode=="TL-target"){ # grid[[mode]] <- expand.grid(prob.common=0.05,prob.separate=0.025,family="gaussian",n_source=50,n_target=cand) # } else if(mode %in% c("TL-prop","MTL-prop")){ # cand <- c(0.025,0.05,0.10,0.15,0.20) # grid[[mode]] <- expand.grid(prob.common=cand,prob.separate=NA,family="gaussian",n_source=50,n_target=50,n0=50) # } else { # stop("Wrong mode.") # } # grid[[mode]] <- grid[[mode]][rep(seq_len(nrow(grid[[mode]])),each=repetitions),] # #grid[[mode]]$seed <- seq_len(nrow(grid[[mode]])) # grid[[mode]]$seed <- rep(x=seq_len(repetitions),times=length(cand)) # grid[[mode]]$family <- as.character(grid[[mode]]$family) # cond <- is.na(grid[[mode]]$prob.separate) # grid[[mode]]$prob.separate[cond] <- 0.5*grid[[mode]]$prob.common[cond] # # for(i in seq(from=1,to=nrow(grid[[mode]]))){ # set.seed(seed=grid$seed[i]) # cat("i=",i,"\n") # if(mode %in% c("TL-source","TL-target","TL-prop")){ # n0 <- rep(c(grid[[mode]]$n_source[i],grid[[mode]]$n_target[i]),times=c(2,1)) # data <- sim_data_trans(prob.common=grid[[mode]]$prob.common[i],prob.separate=grid[[mode]]$prob.separate[i],family=grid[[mode]]$family[i],n0=n0) # method <- c("wrap_separate","wrap_glmtrans","sparselink","wrap_xrnet") # } else if(mode %in% c("MTL-size","MTL-prop")){ # data <- sim_data_multi(prob.common=grid[[mode]]$prob.common[i],prob.separate=grid[[mode]]$prob.separate[i],family=grid[[mode]]$family[i],n0=grid[[mode]]$n0[i]) # method <- c("wrap_separate","wrap_mgaussian","sparselink","wrap_spls") # } else { # stop("Wrong mode.") # } # result <- traintest(y_train=data$y_train,X_train=data$X_train,y_test=data$y_test,X_test=data$X_test,family=grid[[mode]]$family[i],method=method) # metric[[mode]][[i]] <- result$deviance # } # } # # save(grid,metric,file=file.path(path,"results","simulation_devel.RData")) # # writeLines(text=capture.output(utils::sessionInfo(),cat("\n"), sessioninfo::session_info()),con=paste0(path,"/results/info_sim_extra.txt")) ## ----sim_extra_figure,eval=fig.tab-------------------------------------------- # #<> # load(file.path(path,"results","simulation_devel.RData")) # # grDevices::postscript(file=file.path(path,"manuscript","fig_sim_extra.eps"),width=6.5,height=3,horizontal=FALSE,onefile=FALSE,paper="special") # cex <- 0.8 # graphics::par(mfrow=c(1,3),mar=c(4.5,4.5,1.5,1),oma=c(0,0,0,0)) # #graphics::layout(mat=matrix(data=c(1,1,2,2,3,3,0,4,4,0,5,5),ncol=2)) # for(mode in c("MTL-size","TL-source","TL-target")){ # #for(mode in c("MTL-prop","TL-prop")){ # if(mode %in% c("MTL-size","MTL-prop","TL-prop")){ # mse <- sapply(metric[[mode]],function(x) rowMeans(x)) # } else if(mode %in% c("TL-source","TL-target")){ # mse <- sapply(metric[[mode]],function(x) x[,3]) # } # if(mode %in% c("TL-source","TL-target","TL-prop")){ # col <- c("wrap_separate"="black","wrap_glmtrans"="red","wrap_xrnet"="orange","sparselink"="blue") # lty <- c("wrap_separate"=3,"wrap_glmtrans"=2,"wrap_xrnet"=2,"sparselink"=1) # } else if(mode %in% c("MTL-size","MTL-prop")) { # col <- c("wrap_separate"="black","wrap_mgaussian"="red","wrap_spls"="orange","sparselink"="blue") # lty <- c("wrap_separate"=3,"wrap_mgaussian"=2,"wrap_spls"=2,"sparselink"=1) # } # if(mode=="TL-source"){ # params <- grid[[mode]]$n_source # } else if(mode=="TL-target"){ # params <- grid[[mode]]$n_target # } else if(mode=="MTL-size"){ # params <- grid[[mode]]$n0 # } else if(mode %in% c("TL-prop","MTL-prop")){ # params <- grid[[mode]]$prob.common # } # unique <- unique(params) # graphics::plot.new() # graphics::plot.window(xlim=range(params),ylim=range(log(mse))) # graphics::box() # if(mode=="MTL-size"){ # main <- "MTL - varying sample size" # xlab <- bquote("sample size ("~n[1]~"="~n[2]~"="~n[3]~")") # legend <- "" # } else if(mode=="TL-source"){ # main <- "TL - varying source sample size" # xlab <- bquote("source sample size ("~n[1]~"="~n[2]~")") # legend <- bquote("target sample size:"~n[3]==.(unique(grid[[mode]]$n_target))) # } else if(mode=="TL-target"){ # main <- "TL - varying target sample size" # xlab <- bquote("target sample size ("~n[3]~")") # legend <- bquote("source sample size:"~n[1]~"="~n[2]==.(unique(grid[[mode]]$n_source))) # } else if(mode=="MTL-prop"){ # xlab <- "blabla" # main <- "MTL - effect proportion" # legend <- "" # } else if(mode=="TL-prop"){ # xlab <- "blabla" # main <- "TL - effect proportion" # legend <- "" # } # graphics::title(main=main,cex.main=cex) # graphics::title(ylab="log MSE",line=2.5,xlab=xlab,cex.lab=cex) # graphics::legend(x="topleft",legend=legend,bty="n",cex=cex) # if(mode %in% c("TL-prop","MTL-prop")){ # graphics::axis(side=1,at=unique,labels=paste0(100*unique,"%"),cex.axis=cex) # } else { # graphics::axis(side=1,at=unique,cex.axis=cex) # } # graphics::axis(side=2,cex.axis=cex) # # for(i in names(col)){ # val <- tapply(X=mse[i,],INDEX=params,FUN=function(x) mean(x)) # graphics::lines(x=unique,y=log(val),col=col[i],type="o",pch=16,lty=lty[i]) # } # } # grDevices::dev.off() ## ----define_projects,eval=fig.tab--------------------------------------------- # project <- list() # project$IBD <- c("Tew (2016)"="SRP063496", # "Haberman (2019)"="SRP129004", # "Verstockt (2019)"="ERP113396", # "Verstockt (2020)"="ERP114636", # "Boyd (2018)"="SRP100787") # project$RA <- c("Baker (2019)"="SRP169062", # "Moncrieffe (2017)"="SRP074736", # "Goldberg (2018)"="SRP155483") # extra <- c("Lewis (2019)"="ERP104864") # https://doi.org/10.1016/j.celrep.2019.07.091 ## ----download_data,eval=sim.app----------------------------------------------- # #<> # #<> # # data <- list() # for(i in c(unlist(project),extra)){ # data[[i]] <- recount3::create_rse_manual( # project=i, # project_home="data_sources/sra", # organism="human", # annotation = "gencode_v26", # type="gene") # } # save(data,file=file.path(path,"results/recount3_data.RData")) # # writeLines(text=capture.output(utils::sessionInfo(),cat("\n"), # sessioninfo::session_info()),con=paste0(path,"/results/info_data.txt")) ## ----preprocess_data,eval=sim.app--------------------------------------------- # #<> # #<> # # load(file.path(path,"results/recount3_data.RData")) # # #- - - - - - - - - - - - - - - # #- - - extract features - - - # #- - - - - - - - - - - - - - - # # # extract features # x <- list() # for(i in c(unlist(project),extra)){ # counts <- t(SummarizedExperiment::assays(data[[i]])$raw_counts) # colnames(counts) <- SummarizedExperiment::rowRanges(data[[i]])$gene_name # x[[i]] <- counts # } # # # select most expressed protein-coding genes (for all TL projects together) # select <- list() # total <- numeric() # for(i in unlist(project)){ # #total <- rbind(total,Matrix::colSums(x[[i]])) # original: mean filtering # total <- rbind(total,apply(X=x[[i]],MARGIN=2,FUN=stats::var)) # trial: variance filtering # } # type <- SummarizedExperiment::rowData(data[[i]])$gene_type # cond <- type=="protein_coding" # total[,!cond] <- 0 # rank <- apply(X=total,MARGIN=1,FUN=rank) # mean_rank <- rowMeans(rank) # #temp <- cond & apply(total,2,function(x) all(x>0)) & (mean_rank >= sort(mean_rank[cond],decreasing=TRUE)[2000]) # original: top 2000 # temp <- cond & mean_rank >= sort(mean_rank[cond],decreasing=TRUE)[5000] # trial: top 5000 # # for(i in unlist(project)){ # select[[i]] <- temp # } # # # select most expressed protein-coding genes (for MTL project) # #mean <- apply(X=x[[extra]],MARGIN=2,FUN=mean) # original # var <- apply(X=x[[extra]],MARGIN=2,FUN=var) # trial # #warning("change number in next line") # #temp <- cond & mean >= sort(mean[cond],decreasing=TRUE)[5000] # trial: top 5000 # temp <- cond & var >= sort(var[cond],decreasing=TRUE)[5000] # trial: top 5000 # select[[extra]] <- temp # # # pre-processing # for(i in c(unlist(project),extra)){ # lib.size <- Matrix::rowSums(x[[i]]) # x[[i]] <- x[[i]][,select[[i]],drop=FALSE] # norm.factors <- edgeR::calcNormFactors(object=t(x[[i]]),lib.size=lib.size) # gamma <- norm.factors*lib.size/mean(lib.size) # gamma <- matrix(data=gamma,nrow=nrow(x[[i]]),ncol=ncol(x[[i]])) # x[[i]] <- x[[i]]/gamma # x[[i]] <- 2*sqrt(x[[i]] + 3/8) # Anscombe transform # x[[i]] <- scale(x[[i]]) # scale because of different datasets!? # } # # #- - - - - - - - - - - - - - # #- - - extract targets - - - # #- - - - - - - - - - - - - - # # # extract information on samples # frame <- list() # for(i in c(unlist(project),extra)){ # list <- strsplit(data[[i]]$sra.sample_attributes,split="\\|") # data[[i]]$sra.experiment_attributes # # What about sra.experiment_attributes? # n <- length(list) # cols <- unique(sapply(strsplit(unlist(list),split=";;"),function(x) x[1])) # ncol <- length(cols) # frame[[i]] <- matrix(data=NA,nrow=n,ncol=ncol,dimnames=list(rownames(x[[i]]),cols)) # for(j in seq_len(n)){ # for(k in seq_len(ncol)){ # vector <- list[[j]] # which <- which(substring(text=vector,first=1,last=nchar(cols[k]))==cols[k]) # string <- vector[which] # if(length(string)==0){next} # frame[[i]][j,k] <- strsplit(string,split=";;")[[1]][2] # } # } # frame[[i]] <- as.data.frame(frame[[i]]) # } # # # extract binary outcome # y <- z <- list() # for(i in unlist(project)){ # # CONTINUE HERE!!! # if(i=="ERP113396"){ # y[[i]] <- sapply(X=frame[[i]]$`clinical history`,FUN=function(x) switch(EXPR=x,"responder"=1,"non-responder"=0,stop("invalid"))) # } else if(i=="ERP114636"){ # y[[i]] <- sapply(X=frame[[i]]$`clinical information`,FUN=function(x) switch(EXPR=x,"response to vedolizumab therapy"=1-1,"no response to vedolizumab therapy"=0+1,stop("invalid"))) # warning("Inverting response and non-response!") # } else if(i=="SRP100787"){ # y[[i]] <- sapply(X=frame[[i]]$condition,FUN=function(x) switch(EXPR=x,"CD inactive"=1,"UC inactive"=1,"CD active"=0,"UC active"=0,control=NA,"NA"=NA,stop("invalid"))) # } else if(i=="SRP129004"){ # y[[i]] <- sapply(X=frame[[i]]$`week 4 remission`,FUN=function(x) switch(EXP=x,"Yes"=1,"No"=0,"NA"=NA,stop("invalid"))) # suppressWarnings(z[[i]] <- data.frame(pucai=as.numeric(frame[[i]]$pucai),mayo=as.numeric(frame[[i]]$`total mayo score`),histology=as.numeric(frame[[i]]$`histology severity score`))) # } else if(i=="SRP063496"){ # y[[i]] <- sapply(X=frame[[i]]$`remission at week 10`,FUN=function(x) switch(x, "Remitter"=1,"Non-remitter"=0,"N/A"=NA,stop("invalid"))) # } else if(i=="SRP169062"){ # y[[i]] <- sapply(X=frame[[i]]$`flare event`,FUN=function(x) switch(x,"no flare"=1,"flare"=0,stop("invalid"))) # } else if(i=="SRP155483"){ # y[[i]] <- sapply(X=frame[[i]]$`disease activity`,FUN=function(x) switch(x,"remission"=1,"Low"=0,"Moderate"=0,"High"=0,"--"=NA,stop("invalid"))) # z[[i]] <- sapply(X=frame[[i]]$`disease activity`,FUN=function(x) switch(x,"remission"=0,"Low"=1,"Moderate"=2,"High"=3,"--"=NA,stop("invalid"))) # } else if(i=="SRP074736"){ # y[[i]] <- sapply(X=frame[[i]]$`mtx response status`,FUN=function(x) switch(x,"responder"=1,"non-responder"=0,"control"=NA,stop("invalid"))) # } # } # # # overlap # for(j in unlist(project)){ # is.na <- is.na(y[[j]]) # if(length(is.na)!=nrow(x[[j]])){stop()} # y[[j]] <- y[[j]][!is.na] # if(!is.null(z[[j]])){ # if(is.vector(z[[j]])){ # z[[j]] <- z[[j]][!is.na] # } else { # z[[j]] <- z[[j]][!is.na,] # } # } # x[[j]] <- x[[j]][!is.na,] # } ## ----explore_apply,eval=sim.app----------------------------------------------- # #<> # #<> # #<> # # set.seed(1) # alpha.holdout <- 0 # alpha.crossval <- 1 # family <- "binomial" # nfolds <- 10 # codes <- unlist(project) # coef <- matrix(data=NA,nrow=ncol(x[[1]]),ncol=length(codes),dimnames=list(NULL,codes)) # auc <- auc.pvalue <- matrix(data=NA,nrow=length(codes),ncol=length(codes),dimnames=list(codes,codes)) # foldid <- make_folds_trans(y=y,family="binomial",nfolds=nfolds) # # ridge <- lasso <- list() # for(i in seq_along(codes)){ # ridge[[i]] <- glmnet::cv.glmnet(x=x[[codes[i]]],y=y[[codes[i]]],family=family,alpha=alpha.holdout,foldid=foldid[[i]]) # coef[,i] <- stats::coef(ridge[[i]],s="lambda.min")[-1] # for(j in seq_along(codes)){ # if(i==j){ # y_hat <- rep(x=NA,times=length(y[[i]])) # for(k in seq_len(nfolds)){ # holdout <- foldid[[i]]==k # temp <- glmnet::cv.glmnet(x=x[[codes[i]]][!holdout,],y=y[[codes[i]]][!holdout],family=family,alpha=alpha.crossval) # y_hat[holdout] <- predict(object=temp,newx=x[[codes[i]]][holdout,],s="lambda.min",type="response") # } # } else { # y_hat <- as.numeric(predict(object=ridge[[i]],newx=x[[j]],s="lambda.min",type="response")) # } # auc[i,j] <- pROC::auc(response=y[[codes[j]]],predictor=y_hat,direction="<",levels=c(0,1)) # auc.pvalue[i,j] <- stats::wilcox.test(rank(y_hat)~y[[codes[[j]]]],alternative="less",exact=FALSE)$p.value # } # } # # save(coef,auc,auc.pvalue,codes,file=file.path(path,"results","explore_data.RData")) # # writeLines(text=capture.output(utils::sessionInfo(),cat("\n"), # sessioninfo::session_info()),con=paste0(path,"/results/info_explore.txt")) ## ----explore_tables,eval=fig.tab,results="asis"------------------------------- # #<> # #if(any(unlist(project)!=names(refs))){stop("not compatible")} # # load(file.path(path,"results/explore_data.RData")) # names <- gsub(pattern="IBD.|RA.",replacement="",x=names(unlist(project))) # codes <- colnames(coef) # cor.pvalue <- matrix(data=NA,nrow=length(codes),ncol=length(codes),dimnames=list(codes,codes)) # for(i in seq_along(codes)){ # for(j in seq_along(codes)){ # cor.pvalue[i,j] <- stats::cor.test(x=coef[,i],y=coef[,j],method="spearman",exact=FALSE)$p.value # } # } # diag(cor.pvalue) <- NA # # insert.space <- function(table,cut){ # index.left <- index.top <- seq_len(cut) # index.right <- index.bottom <- seq(from=cut+1,to=ncol(table)) # top <- cbind(table[index.top,index.left],"",table[index.top,index.right]) # bottom <- cbind(table[index.bottom,index.left],"",table[index.bottom,index.right]) # out <- rbind(top,"",bottom) # colnames(out)[colnames(out)==""] <- " " # return(out) # } # # table <- stats::cor(coef,method="spearman") # rownames(table) <- colnames(table) <- names # black <- (!is.na(cor.pvalue)) & (cor.pvalue<=0.05) # star <- (!is.na(cor.pvalue)) & (cor.pvalue<=0.05/choose(n=length(codes),k=2)) # nonnegative <- table>=0 # table <- format(round(table,digits=2),digits=2,trim=TRUE) # table[nonnegative] <- paste0("\\phantom{-}",table[nonnegative]) # table[!black] <- paste0("\\textcolor{gray}{",table[!black],"}") # table[star] <- paste0(table[star],"$^\\star$") # table[!star] <- paste0(table[!star],"\\phantom{$^\\star$}") # #table[nonnegative] <- paste0("-",table[nonnegative]) # diag(table) <- "-" # table <- insert.space(table=table,cut=5) # xtable <- xtable::xtable(x=table,align="rccccccccc",caption="Spearman correlation coefficients between the ridge regression coefficients from different datasets. Pairwise combinations of datasets with significantly correlated regression coefficients are highlighted, with black colour for nominal significance ($p$-value $\\leq 0.05$) and stars for adjusted significance ($p$-value $\\leq 0.05/28$). We expect a correlation coefficient close to $0$ for unrelated problems and close to $1$ for identical problems.",label="tab_cor") # xtable::print.xtable(x=xtable,sanitize.text.function=identity,rotate.colnames=TRUE,caption.placement="top",hline.after=c(0,nrow(table)),comment=FALSE,file=file.path(path,"manuscript","tab_cor.tex"),floating.environment="table*") #add.to.row=list(pos=list(5),command="\\hdashline \n") # # table <- auc # rownames(table) <- colnames(table) <- names # table <- format(round(table,digits=2),digits=2) # black <- auc.pvalue<=0.05 # star <- auc.pvalue<=0.05/(length(codes)*length(codes)) # diag(table) <- paste0("(",diag(table),")") # table[!black] <- paste0("\\textcolor{gray}{",table[!black],"}") # table[star] <- paste0(table[star],"$^\\star$") # table[!star] <- paste0(table[!star],"\\phantom{$^\\star$}") # table <- insert.space(table=table,cut=5) # xtable <- xtable::xtable(x=table,align="rccccccccc",caption="Out-of-sample area under the receiver operating characteristic curve (\\textsc{roc-auc}) from logistic ridge regression trained on the dataset in the row and tested on the dataset in the column (off-diagonal entries), or cross-validated \\textsc{roc-auc} from logistic lasso regression trained and tested on the same dataset by $10$-fold external cross-validation (diagonal entries, between brackets). The \\textsc{roc-auc} of a random classifier is $0.5$, while that of a perfect classifier is $1.0$. Entries on and off the diagonal are not comparable. Predictions that are significantly better than random predictions (according to the one-sided Mann-Whitney $U$ test for testing whether the ranks of the predicted probabilities are significantly higher for the cases than for the controls) are highlighted, with black colour for nominal significance ($p$-value $\\leq 0.05$) and stars for adjusted significance ($p$-value $\\leq 0.05/64$).",label="tab_auc") # xtable::print.xtable(x=xtable,sanitize.text.function=identity,rotate.colnames=TRUE,caption.placement="top",hline.after=c(0,nrow(table)),comment=FALSE,file=file.path(path,"manuscript","tab_auc.tex"),floating.environment="table*") ## ----transfer_apply,eval=sim.app---------------------------------------------- # #<> # #<> # #<> # # result <- list() # for(i in names(project)){ # cat("project:",i,"\n") # result[[i]] <- list() # for(j in seq_len(5)){ # 5 repetitions of 10-fold CV # set.seed(j) # codes <- project[[i]] # result[[i]][[j]] <- cv_transfer(y=y[codes],X=x[codes],family="binomial",method=c("wrap_separate","wrap_glmtrans","sparselink","wrap_xrnet"),alpha.init=ifelse(i=="RA",0,0.95)) # lasso-like elastic net for IBD, ridge for RA (weak signal) # } # } # save(result,project,file=file.path(path,"results","application.RData")) # # writeLines(text=capture.output(utils::sessionInfo(),cat("\n"), # sessioninfo::session_info()),con=paste0(path,"/results/info_app.txt")) # # # debugging # if(FALSE){ # set.seed(1) # codes <- project[["RA"]] # test <- wrap_xrnet(x=x[codes],y=y[codes],alpha.init=0.95,alpha=1) # # } ## ----transfer_figure,eval=fig.tab--------------------------------------------- # #<> # # grDevices::postscript(file=file.path(path,"manuscript","fig_app.eps"),width=6.5,height=4,horizontal=FALSE,onefile=FALSE,paper="special") # graphics::par(mfrow=c(2,1),mar=c(4,2,1,1),oma=c(0,0,0,0)) # load(file.path(path,paste0("results/application.RData")),verbose=TRUE) # # model0 <- "wrap_glmtrans" # model1 <- "sparselink" # model2 <- "wrap_xrnet" # # # predictivity # metric <- lapply(result,function(x) do.call(what="rbind",args=lapply(x,function(x) x$auc))) # DEV and AUC need different directions (increase=FALSE/TRUE)! # metric <- do.call(what="rbind",args=metric) # metric <- metric/metric[,"wrap_separate"] # #xlab <- refs[rownames(metric)] # #names <- gsub(pattern="",replacement="\n",x=unlist(project)) # # label <- gsub(pattern="IBD.|RA.",replacement="",x=gsub(pattern=" ",replacement="\n",x=names(unlist(project)))) # index <- match(x=rownames(metric),table=unlist(project)) # # xlab <- label[index] # plot_change(x=xlab,y0=metric[,model0],y1=metric[,model1],y2=metric[,model2],main="metric",increase=TRUE,cex.main=0.8) # graphics::axis(side=1,at=length(project$IBD)+0.5,labels="|",tick=FALSE,line=-0.25,font=2) # graphics::abline(h=0.5,lty=2,col="grey") # graphics::abline(h=1,lty=2,col="grey") # # # sparsity # nzero <- lapply(result,function(x) lapply(x,function(x) sapply(x$refit$coef,function(x) colSums(x!=0)))) # nzero <- do.call(what="rbind",args=do.call(what="c",args=nzero)) # plot_change(x=xlab,y0=nzero[,model0],y1=nzero[,model1],y2=nzero[,model2],main="sparsity",increase=FALSE,cex.main=0.8) # graphics::axis(side=1,at=length(project$IBD)+0.5,labels="|",tick=FALSE,line=-0.25,font=2) # graphics::abline(h=0,lty=2,col="grey") # grDevices::dev.off() # # # percentage change # # (reported in section 4 "application" subsection 4.3 "transfer learning") # # disease <- ifelse(rownames(metric) %in% project$IBD,"IBD",ifelse(rownames(metric) %in% project$RA,"RA",NA)) # # #round(100*colMeans(metric)-100,digits=2) # round(100*colMeans(metric[disease=="IBD",])-100,digits=2) # round(100*colMeans(metric[disease=="RA",])-100,digits=2) # # colMeans(nzero) # colMeans(nzero[disease=="IBD",]) # colMeans(nzero[disease=="RA",]) # # #--- revision: report AUC --- # Reduce(f="+",x=lapply(result$IBD,function(x) x$auc))/length(result$IBD) # lapply(result,function(x) round(colMeans(Reduce(f="+",x=lapply(x,function(x) x$auc))/length(result$IBD)),digits=2)) ## ----interpret_models,eval=fig.tab-------------------------------------------- # #<> # # load(file.path(path,"results","application.RData")) # # coefs <- list() # for(i in seq_along(result$IBD)){ # coefs[[i]] <- result$IBD[[i]]$refit$coef$sparselink # colnames(coefs[[i]]) <- names(project$IBD) # rownames(coefs[[i]]) <- rownames(result$IBD[[1]]$refit$coef$wrap_glmtrans) # try to avoid this # } # # any <- rowSums(sapply(coefs,function(x) apply(x,1,function(x) any(x!=0))))!=0 # for(i in seq_along(result$IBD)){ # coefs[[i]] <- coefs[[i]][any,] # } # table <- Reduce(f="+",x=coefs)/5 # # cex <- 0.7 # # grDevices::postscript(file=file.path(path,"manuscript","fig_coef.eps"),width=7,height=4,horizontal=FALSE,onefile=FALSE,paper="special") # graphics::par(mfrow=c(1,1),mar=c(2.5,4.5,0.5,1.5),oma=c(0,0,0,0)) # graphics::plot.new() # graphics::plot.window(xlim=c(0.6,ncol(table)+0.4),ylim=c(0.5,nrow(table)+0.5)) # col <- apply(table,1,function(x) ifelse(all(x<=0),"blue",ifelse(all(x>=0),"red","black"))) # colnames <- gsub(x=colnames(table),pattern=" ",replacement="\n") # graphics::mtext(text=colnames,side=1,at=seq_len(ncol(table)),cex=cex,line=1) # rownames <- rownames(table) # graphics::mtext(text=rownames,side=2,at=seq_len(nrow(table)),las=2,cex=cex,line=0.7,col=col) # star <- rowSums(table!=0)>1 # graphics::mtext(text="*",side=2,at=which(star),line=-0.3) # graphics::mtext(text=ifelse(col=="blue","-",ifelse(col=="red","+",".")),side=4,at=seq_len(nrow(table)),las=2,cex=cex,line=0.5,col=col) # for(i in seq_len(nrow(table))){ # for(j in seq_len(ncol(table))){ # for(k in 1:5){ # col <- ifelse(coefs[[k]][i,j]<0,"blue",ifelse(coefs[[k]][i,j]>0,"red","white")) # cex <- pmax(sqrt(5*abs(coefs[[k]][i,j])),0.2) # graphics::points(x=j-(-3+k)*0.17,y=i,col=col,cex=cex,pch=16) # } # } # } # graphics::abline(v=seq(from=0.5,to=5.5,by=1)) # grDevices::dev.off() ## ----eval=FALSE--------------------------------------------------------------- # path <- "C:/Users/arauschenberger/Desktop/sparselink" # LIH (Windows) # #path <- "/Users/armin.rauschenberger/Desktop/LIH/sparselink" # LCSB (Mac) # # dir <- c("results","manuscript","package/R/functions.R") # for(i in seq_along(dir)){ # if(!dir.exists(file.path(path,dir[i]))&!file.exists(file.path(path,dir[i]))){ # stop(paste0("Require folder/file'",dir[i],"'.")) # } # } # source(file.path(path,"package/R/functions.R")) # Or load 'sparselink' package. # # inst <- rownames(utils::installed.packages()) # pkgs <- c("knitr","rmarkdown","glmnet","BiocManager","mvtnorm","glmtrans","spls","xrnet") # for(i in seq_along(pkgs)){ # if(!pkgs[i]%in%inst){ # utils::install.packages(pkgs[i]) # } # } # pkgs <- c("recount3","edgeR") # for(i in seq_along(pkgs)){ # if(!pkgs[i]%in%inst){ # BiocManager::install(pkgs[i]) # } # } # # blue <- "blue"; red <- "red" # # if(exists("sim.app")&exists("fig.tab")){ # if(!sim.app&fig.tab){ # files <- c("simulation_multiple.RData","simulation_transfer.RData","recount3_data.RData","explore_data.RData","application.RData") # for(i in seq_along(files)){ # if(!file.exists(file.path(path,"results",files[i]))){ # stop("File",files[i],"is missing.") # } # } # } # } # project <- list() # project$IBD <- c("Tew (2016)"="SRP063496", # "Haberman (2019)"="SRP129004", # "Verstockt (2019)"="ERP113396", # "Verstockt (2020)"="ERP114636", # "Boyd (2018)"="SRP100787") # project$RA <- c("Baker (2019)"="SRP169062", # "Moncrieffe (2017)"="SRP074736", # "Goldberg (2018)"="SRP155483") # extra <- c("Lewis (2019)"="ERP104864") # https://doi.org/10.1016/j.celrep.2019.07.091 # #<> # #<> # # load(file.path(path,"results/recount3_data.RData")) # # #- - - - - - - - - - - - - - - # #- - - extract features - - - # #- - - - - - - - - - - - - - - # # # extract features # x <- list() # for(i in c(unlist(project),extra)){ # counts <- t(SummarizedExperiment::assays(data[[i]])$raw_counts) # colnames(counts) <- SummarizedExperiment::rowRanges(data[[i]])$gene_name # x[[i]] <- counts # } # # # select most expressed protein-coding genes (for all TL projects together) # select <- list() # total <- numeric() # for(i in unlist(project)){ # #total <- rbind(total,Matrix::colSums(x[[i]])) # original: mean filtering # total <- rbind(total,apply(X=x[[i]],MARGIN=2,FUN=stats::var)) # trial: variance filtering # } # type <- SummarizedExperiment::rowData(data[[i]])$gene_type # cond <- type=="protein_coding" # total[,!cond] <- 0 # rank <- apply(X=total,MARGIN=1,FUN=rank) # mean_rank <- rowMeans(rank) # #temp <- cond & apply(total,2,function(x) all(x>0)) & (mean_rank >= sort(mean_rank[cond],decreasing=TRUE)[2000]) # original: top 2000 # temp <- cond & mean_rank >= sort(mean_rank[cond],decreasing=TRUE)[5000] # trial: top 5000 # # for(i in unlist(project)){ # select[[i]] <- temp # } # # # select most expressed protein-coding genes (for MTL project) # #mean <- apply(X=x[[extra]],MARGIN=2,FUN=mean) # original # var <- apply(X=x[[extra]],MARGIN=2,FUN=var) # trial # #warning("change number in next line") # #temp <- cond & mean >= sort(mean[cond],decreasing=TRUE)[5000] # trial: top 5000 # temp <- cond & var >= sort(var[cond],decreasing=TRUE)[5000] # trial: top 5000 # select[[extra]] <- temp # # # pre-processing # for(i in c(unlist(project),extra)){ # lib.size <- Matrix::rowSums(x[[i]]) # x[[i]] <- x[[i]][,select[[i]],drop=FALSE] # norm.factors <- edgeR::calcNormFactors(object=t(x[[i]]),lib.size=lib.size) # gamma <- norm.factors*lib.size/mean(lib.size) # gamma <- matrix(data=gamma,nrow=nrow(x[[i]]),ncol=ncol(x[[i]])) # x[[i]] <- x[[i]]/gamma # x[[i]] <- 2*sqrt(x[[i]] + 3/8) # Anscombe transform # x[[i]] <- scale(x[[i]]) # scale because of different datasets!? # } # # #- - - - - - - - - - - - - - # #- - - extract targets - - - # #- - - - - - - - - - - - - - # # # extract information on samples # frame <- list() # for(i in c(unlist(project),extra)){ # list <- strsplit(data[[i]]$sra.sample_attributes,split="\\|") # data[[i]]$sra.experiment_attributes # # What about sra.experiment_attributes? # n <- length(list) # cols <- unique(sapply(strsplit(unlist(list),split=";;"),function(x) x[1])) # ncol <- length(cols) # frame[[i]] <- matrix(data=NA,nrow=n,ncol=ncol,dimnames=list(rownames(x[[i]]),cols)) # for(j in seq_len(n)){ # for(k in seq_len(ncol)){ # vector <- list[[j]] # which <- which(substring(text=vector,first=1,last=nchar(cols[k]))==cols[k]) # string <- vector[which] # if(length(string)==0){next} # frame[[i]][j,k] <- strsplit(string,split=";;")[[1]][2] # } # } # frame[[i]] <- as.data.frame(frame[[i]]) # } # # # extract binary outcome # y <- z <- list() # for(i in unlist(project)){ # # CONTINUE HERE!!! # if(i=="ERP113396"){ # y[[i]] <- sapply(X=frame[[i]]$`clinical history`,FUN=function(x) switch(EXPR=x,"responder"=1,"non-responder"=0,stop("invalid"))) # } else if(i=="ERP114636"){ # y[[i]] <- sapply(X=frame[[i]]$`clinical information`,FUN=function(x) switch(EXPR=x,"response to vedolizumab therapy"=1-1,"no response to vedolizumab therapy"=0+1,stop("invalid"))) # warning("Inverting response and non-response!") # } else if(i=="SRP100787"){ # y[[i]] <- sapply(X=frame[[i]]$condition,FUN=function(x) switch(EXPR=x,"CD inactive"=1,"UC inactive"=1,"CD active"=0,"UC active"=0,control=NA,"NA"=NA,stop("invalid"))) # } else if(i=="SRP129004"){ # y[[i]] <- sapply(X=frame[[i]]$`week 4 remission`,FUN=function(x) switch(EXP=x,"Yes"=1,"No"=0,"NA"=NA,stop("invalid"))) # suppressWarnings(z[[i]] <- data.frame(pucai=as.numeric(frame[[i]]$pucai),mayo=as.numeric(frame[[i]]$`total mayo score`),histology=as.numeric(frame[[i]]$`histology severity score`))) # } else if(i=="SRP063496"){ # y[[i]] <- sapply(X=frame[[i]]$`remission at week 10`,FUN=function(x) switch(x, "Remitter"=1,"Non-remitter"=0,"N/A"=NA,stop("invalid"))) # } else if(i=="SRP169062"){ # y[[i]] <- sapply(X=frame[[i]]$`flare event`,FUN=function(x) switch(x,"no flare"=1,"flare"=0,stop("invalid"))) # } else if(i=="SRP155483"){ # y[[i]] <- sapply(X=frame[[i]]$`disease activity`,FUN=function(x) switch(x,"remission"=1,"Low"=0,"Moderate"=0,"High"=0,"--"=NA,stop("invalid"))) # z[[i]] <- sapply(X=frame[[i]]$`disease activity`,FUN=function(x) switch(x,"remission"=0,"Low"=1,"Moderate"=2,"High"=3,"--"=NA,stop("invalid"))) # } else if(i=="SRP074736"){ # y[[i]] <- sapply(X=frame[[i]]$`mtx response status`,FUN=function(x) switch(x,"responder"=1,"non-responder"=0,"control"=NA,stop("invalid"))) # } # } # # # overlap # for(j in unlist(project)){ # is.na <- is.na(y[[j]]) # if(length(is.na)!=nrow(x[[j]])){stop()} # y[[j]] <- y[[j]][!is.na] # if(!is.null(z[[j]])){ # if(is.vector(z[[j]])){ # z[[j]] <- z[[j]][!is.na] # } else { # z[[j]] <- z[[j]][!is.na,] # } # } # x[[j]] <- x[[j]][!is.na,] # } # source(file.path(path,"package/R/development.R")) # # x <- x$SRP100787 # y <- y$SRP100787 # sd <- apply(x,2,sd) # temporary # REMOVE THIS! # x <- x[,sd>=sort(sd,decreasing=TRUE)[100]] # REMOVE THIS! # iter <- 10 # q <- 2 # n <- nrow(x) # # #-- -- --- --- # #--- MTL --- # #-- -- --- --- # # prob_mtl <- c(0.00,0.05,0.10,0.15,0.20) # auc_mtl <- list() # for(i in seq_along(prob_mtl)){ # auc_mtl[[i]] <- list() # for(j in seq_len(iter)){ # Y <- matrix(data=NA,nrow=n,ncol=q) # for(k in seq_len(q)){ # Y[,k] <- abs(y - stats::rbinom(n=n,size=1,prob=prob_mtl[i])) # } # #auc_mtl[[i]][[j]] <- cv_multiple(y=Y,X=x,method=c("wrap_separate","sparselink"),family="binomial",alpha.init=0.95,alpha=1,type="exp",trial=FALSE)$auc # auc_mtl[[i]][[j]] <- joinet::cv.joinet(Y=Y,X=x,family="binomial") # } # } # # MTL is beneficial under prob=0.15 # # # for joinet (deviance: lower=better) # a <- sapply(auc_mtl,function(x) rowMeans(sapply(x,function(x) rowMeans(x)))) # plot(prob_mtl,y=a["base",],type="o") # lines(prob_mtl,y=a["meta",],type="o",lty=2) # # # #lapply(auc_mtl,function(x) rowMeans(sapply(x,function(x) colMeans(x)))) # # #--- --- --- --- # #--- TL --- # #--- --- --- --- # # prob_tl <- c(0.3,0.4,0.5,0.6,0.7) # auc_tl <- list() # for(i in seq_along(prob_tl)){ # auc_tl[[i]] <- list() # cond <- rep(x=NA,times=n) # Y <- X <- list() # for(j in seq_len(iter)){ # n0 <- round(prob_tl[i]*sum(y==0)) # n1 <- round(prob_tl[i]*sum(y==1)) # cond[y==0] <- sample(rep(x=c(0,1),times=c(n0,sum(y==0)-n0))) #sample(rep(x=c(0,1),length.out=sum(y==0))) # cond[y==1] <- sample(rep(x=c(0,1),times=c(n1,sum(y==1)-n1))) #sample(rep(x=c(0,1),length.out=sum(y==1))) # Y <- list(y1=y[cond==0],y2=y[cond==1]) # X <- list(x1=x[cond==0,],x2=x[cond==1,]) # auc_tl[[i]][[j]] <- cv_transfer(y=Y,X=X,method=c("wrap_separate","sparselink"),family="binomial",alpha.init=0.95,alpha=1,type="exp",trial=FALSE)$auc # } # } # # save(prob_mtl,auc_mtl,prob_tl,auc_tl,file=file.path(path,"results","sim-devel.RData")) # # graphics::par(mfrow=c(1,2)) # # # MTL: AUC vs contamination # aucs <- lapply(auc_mtl,function(x) sapply(x,function(x) colMeans(x))) # auc <- sapply(aucs,function(x) rowMeans(x)) # graphics::plot(x=prob_mtl,auc["sparselink",],type="o",ylim=c(0.4,0.8),col="blue") # graphics::lines(x=prob_mtl,auc["wrap_separate",],type="o",col="red") # graphics::abline(h=0.5,lty=2,col="grey") # # # TL: source/target sample size # aucs <- lapply(auc_tl,function(x) sapply(x,function(x) x[2,])) # auc <- sapply(aucs,function(x) rowMeans(x)) # graphics::plot(x=prob_tl,auc["sparselink",],type="o",ylim=c(0.4,0.8),col="blue") # graphics::lines(x=prob_tl,auc["wrap_separate",],type="o",col="red") # graphics::abline(h=0.5,lty=2,col="grey") # # # statistical testing # # # ## ----multi-task,eval=FALSE---------------------------------------------------- # path <- "C:/Users/arauschenberger/Desktop/sparselink" # LIH (Windows) # #path <- "/Users/armin.rauschenberger/Desktop/LIH/sparselink" # LCSB (Mac) # # dir <- c("results","manuscript","package/R/functions.R") # for(i in seq_along(dir)){ # if(!dir.exists(file.path(path,dir[i]))&!file.exists(file.path(path,dir[i]))){ # stop(paste0("Require folder/file'",dir[i],"'.")) # } # } # source(file.path(path,"package/R/functions.R")) # Or load 'sparselink' package. # # inst <- rownames(utils::installed.packages()) # pkgs <- c("knitr","rmarkdown","glmnet","BiocManager","mvtnorm","glmtrans","spls","xrnet") # for(i in seq_along(pkgs)){ # if(!pkgs[i]%in%inst){ # utils::install.packages(pkgs[i]) # } # } # pkgs <- c("recount3","edgeR") # for(i in seq_along(pkgs)){ # if(!pkgs[i]%in%inst){ # BiocManager::install(pkgs[i]) # } # } # # blue <- "blue"; red <- "red" # # if(exists("sim.app")&exists("fig.tab")){ # if(!sim.app&fig.tab){ # files <- c("simulation_multiple.RData","simulation_transfer.RData","recount3_data.RData","explore_data.RData","application.RData") # for(i in seq_along(files)){ # if(!file.exists(file.path(path,"results",files[i]))){ # stop("File",files[i],"is missing.") # } # } # } # } # project <- list() # project$IBD <- c("Tew (2016)"="SRP063496", # "Haberman (2019)"="SRP129004", # "Verstockt (2019)"="ERP113396", # "Verstockt (2020)"="ERP114636", # "Boyd (2018)"="SRP100787") # project$RA <- c("Baker (2019)"="SRP169062", # "Moncrieffe (2017)"="SRP074736", # "Goldberg (2018)"="SRP155483") # extra <- c("Lewis (2019)"="ERP104864") # https://doi.org/10.1016/j.celrep.2019.07.091 # #<> # #<> # # load(file.path(path,"results/recount3_data.RData")) # # #- - - - - - - - - - - - - - - # #- - - extract features - - - # #- - - - - - - - - - - - - - - # # # extract features # x <- list() # for(i in c(unlist(project),extra)){ # counts <- t(SummarizedExperiment::assays(data[[i]])$raw_counts) # colnames(counts) <- SummarizedExperiment::rowRanges(data[[i]])$gene_name # x[[i]] <- counts # } # # # select most expressed protein-coding genes (for all TL projects together) # select <- list() # total <- numeric() # for(i in unlist(project)){ # #total <- rbind(total,Matrix::colSums(x[[i]])) # original: mean filtering # total <- rbind(total,apply(X=x[[i]],MARGIN=2,FUN=stats::var)) # trial: variance filtering # } # type <- SummarizedExperiment::rowData(data[[i]])$gene_type # cond <- type=="protein_coding" # total[,!cond] <- 0 # rank <- apply(X=total,MARGIN=1,FUN=rank) # mean_rank <- rowMeans(rank) # #temp <- cond & apply(total,2,function(x) all(x>0)) & (mean_rank >= sort(mean_rank[cond],decreasing=TRUE)[2000]) # original: top 2000 # temp <- cond & mean_rank >= sort(mean_rank[cond],decreasing=TRUE)[5000] # trial: top 5000 # # for(i in unlist(project)){ # select[[i]] <- temp # } # # # select most expressed protein-coding genes (for MTL project) # #mean <- apply(X=x[[extra]],MARGIN=2,FUN=mean) # original # var <- apply(X=x[[extra]],MARGIN=2,FUN=var) # trial # #warning("change number in next line") # #temp <- cond & mean >= sort(mean[cond],decreasing=TRUE)[5000] # trial: top 5000 # temp <- cond & var >= sort(var[cond],decreasing=TRUE)[5000] # trial: top 5000 # select[[extra]] <- temp # # # pre-processing # for(i in c(unlist(project),extra)){ # lib.size <- Matrix::rowSums(x[[i]]) # x[[i]] <- x[[i]][,select[[i]],drop=FALSE] # norm.factors <- edgeR::calcNormFactors(object=t(x[[i]]),lib.size=lib.size) # gamma <- norm.factors*lib.size/mean(lib.size) # gamma <- matrix(data=gamma,nrow=nrow(x[[i]]),ncol=ncol(x[[i]])) # x[[i]] <- x[[i]]/gamma # x[[i]] <- 2*sqrt(x[[i]] + 3/8) # Anscombe transform # x[[i]] <- scale(x[[i]]) # scale because of different datasets!? # } # # #- - - - - - - - - - - - - - # #- - - extract targets - - - # #- - - - - - - - - - - - - - # # # extract information on samples # frame <- list() # for(i in c(unlist(project),extra)){ # list <- strsplit(data[[i]]$sra.sample_attributes,split="\\|") # data[[i]]$sra.experiment_attributes # # What about sra.experiment_attributes? # n <- length(list) # cols <- unique(sapply(strsplit(unlist(list),split=";;"),function(x) x[1])) # ncol <- length(cols) # frame[[i]] <- matrix(data=NA,nrow=n,ncol=ncol,dimnames=list(rownames(x[[i]]),cols)) # for(j in seq_len(n)){ # for(k in seq_len(ncol)){ # vector <- list[[j]] # which <- which(substring(text=vector,first=1,last=nchar(cols[k]))==cols[k]) # string <- vector[which] # if(length(string)==0){next} # frame[[i]][j,k] <- strsplit(string,split=";;")[[1]][2] # } # } # frame[[i]] <- as.data.frame(frame[[i]]) # } # # # extract binary outcome # y <- z <- list() # for(i in unlist(project)){ # # CONTINUE HERE!!! # if(i=="ERP113396"){ # y[[i]] <- sapply(X=frame[[i]]$`clinical history`,FUN=function(x) switch(EXPR=x,"responder"=1,"non-responder"=0,stop("invalid"))) # } else if(i=="ERP114636"){ # y[[i]] <- sapply(X=frame[[i]]$`clinical information`,FUN=function(x) switch(EXPR=x,"response to vedolizumab therapy"=1-1,"no response to vedolizumab therapy"=0+1,stop("invalid"))) # warning("Inverting response and non-response!") # } else if(i=="SRP100787"){ # y[[i]] <- sapply(X=frame[[i]]$condition,FUN=function(x) switch(EXPR=x,"CD inactive"=1,"UC inactive"=1,"CD active"=0,"UC active"=0,control=NA,"NA"=NA,stop("invalid"))) # } else if(i=="SRP129004"){ # y[[i]] <- sapply(X=frame[[i]]$`week 4 remission`,FUN=function(x) switch(EXP=x,"Yes"=1,"No"=0,"NA"=NA,stop("invalid"))) # suppressWarnings(z[[i]] <- data.frame(pucai=as.numeric(frame[[i]]$pucai),mayo=as.numeric(frame[[i]]$`total mayo score`),histology=as.numeric(frame[[i]]$`histology severity score`))) # } else if(i=="SRP063496"){ # y[[i]] <- sapply(X=frame[[i]]$`remission at week 10`,FUN=function(x) switch(x, "Remitter"=1,"Non-remitter"=0,"N/A"=NA,stop("invalid"))) # } else if(i=="SRP169062"){ # y[[i]] <- sapply(X=frame[[i]]$`flare event`,FUN=function(x) switch(x,"no flare"=1,"flare"=0,stop("invalid"))) # } else if(i=="SRP155483"){ # y[[i]] <- sapply(X=frame[[i]]$`disease activity`,FUN=function(x) switch(x,"remission"=1,"Low"=0,"Moderate"=0,"High"=0,"--"=NA,stop("invalid"))) # z[[i]] <- sapply(X=frame[[i]]$`disease activity`,FUN=function(x) switch(x,"remission"=0,"Low"=1,"Moderate"=2,"High"=3,"--"=NA,stop("invalid"))) # } else if(i=="SRP074736"){ # y[[i]] <- sapply(X=frame[[i]]$`mtx response status`,FUN=function(x) switch(x,"responder"=1,"non-responder"=0,"control"=NA,stop("invalid"))) # } # } # # # overlap # for(j in unlist(project)){ # is.na <- is.na(y[[j]]) # if(length(is.na)!=nrow(x[[j]])){stop()} # y[[j]] <- y[[j]][!is.na] # if(!is.null(z[[j]])){ # if(is.vector(z[[j]])){ # z[[j]] <- z[[j]][!is.na] # } else { # z[[j]] <- z[[j]][!is.na,] # } # } # x[[j]] <- x[[j]][!is.na,] # } # source(file.path(path,"package/R/development.R")) # # #- - - - - - - - - - # #- - prepare data - - # #- - - - - - - - - - # # # explore SRP048801 # # project <- "ERP104864" # if(project=="SRP155483"){ # X <- x$SRP155483 # vars <- c("disease activity","das score") # Y <- frame$SRP155483[,vars] # sapply(unique(Y$`disease activity`),function(x) range(as.numeric(Y$`das score`[Y$`disease activity`==x]))) # warning("DAS score defines disease activity!") # problem <- list() # problem$unique <- colnames(Y) # } else if(project=="SRP129004"){ # X <- x$SRP129004 # vars <- c("histology severity score","total mayo score") # Y <- frame$SRP129004[,vars] # names <- intersect(rownames(X),rownames(Y)) # X <- X[names,] # Y <- Y[names,] # cond <- frame$SRP129004[names,"diagnosis"]=="Ulcerative Colitis" # X <- X[cond,] # Y <- Y[cond,] # problem <- list() # problem$unique <- colnames(Y) # } else if(project=="ERP104864"){ # X <- x$ERP104864 # vars <- c("CCP","CRP","crp","DAS28","ESR","esr","HAQ","VAS","SWOLLEN","TENDER") # "inflammatory score" dropped due to NA # "RF" dropped because binary # Y <- frame$ERP104864[,vars] # pathotype <- frame$ERP104864$pathotype # if(any(rownames(X)!=rownames(Y))){stop()} # #Y <- Y[Y$pathotype=="lymphoid",vars] # if(any(!is.na(Y$CRP)&!is.na(Y$crp))){ # stop("Invalid.") # } else { # Y$CRP[is.na(Y$CRP)] <- Y$crp[is.na(Y$CRP)] # Y$crp <- NULL # } # if(any(!is.na(Y$ESR)&!is.na(Y$esr))){ # stop("Invalid.") # } else { # Y$ESR[is.na(Y$ESR)] <- Y$esr[is.na(Y$ESR)] # Y$esr <- NULL # } # problem <- list() # #problem$joint <- c("SWOLLEN","TENDER") # #problem$proms <- c("VAS","HAQ") # #problem$labor <- c("CRP","ESR") # CCP might be too different (check cor) # problem$trial <- c("CRP","ESR","SWOLLEN") # #problem$trial <- c("DAS28","SWOLLEN","TENDER") # #problem$all <- colnames(y) # } # # for(i in seq_len(ncol(Y))){ # class(Y[[i]]) <- "numeric" # } # # cond <- apply(X=Y,MARGIN=1,FUN=function(x) all(!is.na(x))) #& pathotype=="lymphoid" # subtypes seem to be defined based on clinical scores # #Y <- as.matrix(Y)[cond,] # temporary: binarisation # y <- scale(as.matrix(Y)[cond,]) # x <- X[cond,] # # #- - - - - - - - - - - - - # #- - explore similarity - - # #- - - - - - - - - - - - - # # cor <- stats::cor(y,use="pairwise.complete.obs",method="spearman") # round(x=cor,digits=2) # graphics::image(t(cor)[,ncol(cor):1]) # #stats::heatmap(x=as.matrix(Y),Rowv=NA) # d <- stats::dist(x=t(Y)) # hclust <- stats::hclust(d=d) # graphics::plot(hclust) # # graphics::par(mfrow=c(2,4),mar=c(2,2,1,1)) # for(i in seq_len(ncol(y))){ # pvalue <- apply(x,2,function(x) cor.test(x,y[,i],method="spearman")$p.value) # graphics::hist(x=pvalue,main=colnames(y)[i]) # } # # #--- relationship between cor coef and cor p-value --- # coef <- stats::cor(y=y[,"CRP"],x=x,method="spearman") # pval <- apply(X=x,MARGIN=2,FUN=function(x) stats::cor.test(x=x,y=y[,"CRP"],method="spearman")$p.value) # graphics::plot(x=coef,y=sign(coef)*(-log10(pval))^0.5) # #--- conclusion: equivalent --- # # #--- examine correlation between ridge coefficients --- # coef <- matrix(data=NA,nrow=ncol(x),ncol=ncol(y),dimnames=list(NULL,colnames(y))) # for(i in seq_len(ncol(y))){ # object <- glmnet::cv.glmnet(x=x,y=y[,i],family="gaussian",alpha=0) # coef[,i] <- stats::coef(object=object,s="lambda.min")[-1] # } # round(stats::cor(coef,method="spearman"),digits=2) # #--- conclusion: some sets are strongly correlated --- # # #cor(rowSums(scale(Y)),Y,method="spearman") # # #- - - - - - - - - - - - - # #- - cross-validation - - # #- - - - - - - - - - - - - # # alpha.init <- 0.95; alpha <- 1; type <- "exp"; trial <- FALSE # #alpha.init <- NA; alpha <- 1; type <- "exp"; trial <- TRUE # results <- list() # for(k in seq_along(problem)){ # results[[k]] <- list() # for(i in seq_len(3)){ # 5 repetitions of 10-fold CV # set.seed(i) # method <- c("wrap_empty","wrap_separate","wrap_mgaussian","sparselink","wrap_spls") #"sparselink" "group.devel" # results[[k]][[i]] <- cv_multiple(y=y[,problem[[k]]],X=x,family="gaussian",method=method,nfolds=10,alpha=alpha,alpha.init=alpha.init,type=type,trial=trial) # } # } # # save(results,file=file.path(path,"results","app_mtl_devel.RData")) # # lapply(results,function(x) lapply(x,function(x) colMeans(x$deviance))) # # # colMeans(results[[1]][[1]]$deviance/results[[1]][[1]]$deviance[,"wrap_empty"]) # names(results) <- names(problem) # # colMeans(results$trial[[1]]$deviance/results$trial[[1]]$deviance[,"wrap_empty"]) # # lapply(results,function(x) rowMeans(sapply(x,function(x) rowMeans(apply(x$deviance,1,rank))))) # # object <- devel(y=y[,problem$trial],x=x,family="gaussian") # # cm <- numeric() # for(k in seq_along(problem)){ # for(i in seq_len(1)){ # dev <- results[[k]][[i]]$deviance # temp <- colMeans(dev/dev[,"wrap_separate"]) # cm <- rbind(cm,temp) # #cat(cm,"\n") # } # } # colMeans(cm) # # # separate, mgaussian, sparselink and devel have similar performance (i.e., MTL has no benefit with any of these three approaches), examine whether other approaches are better # # # examine whether group lasso for TL/MTL performs better # # # Consider using different candidate values, e.g., 0.01,0.5,1,1.5,2,10 also for sparselink. Removing 0 might be a good choice. (If 0 is not included, however, initial ridge regression might be better.) # # #- - - - - - - - - - - - # #- - learning curve - - # #- - - - - - - - - - - - # # size <- unique(c(seq(from=50,to=nrow(x),by=25),nrow(x))) # iter <- 10 # increase to 10 # size <- nrow(x) # suppress learning curve # # metric <- list() # setting <- expand.grid(problem=names(problem),size=size,iter=seq_len(iter)) # for(i in seq_len(nrow(setting))){ # cat(progress=paste0(100*i/nrow(setting),"%"),"\n") # cond <- sample(x=rep(x=c(FALSE,TRUE),times=c(nrow(x)-setting$size[i],setting$size[i]))) # vars <- problem[[setting$problem[i]]] # metric[[i]] <- joinet:::cv.joinet(Y=y[cond,vars],X=x[cond,],family="gaussian",compare="mnorm") # #method <- c("wrap_empty","wrap_separate","wrap_mgaussian","sparselink") # #metric[[i]] <- cv_multiple(y=y[cond,vars],X=x[cond,],family="gaussian",method=method,nfolds=10,alpha.init=0.95,alpha=1,type="exp",trial=FALSE) # } # frac <- sapply(X=metric,FUN=function(x) colMeans(t(x)/x["none",])) # # #frac <- sapply(X=metric,FUN=function(x) colMeans(x$deviance/x$deviance[,"wrap_empty"])) # # graphics::par(mfrow=c(1,length(problem)),mar=c(3,3,2,0.5)) # for(i in seq_along(problem)){ # graphics::plot.new() # graphics::plot.window(xlim=range(size),ylim=range(frac)) # graphics::box() # graphics::title(main=names(problem)[i]) # graphics::axis(side=1) # graphics::axis(side=2) # graphics::abline(h=1,col="grey",lty=2) # cond <- setting$problem==names(problem)[i] # col <- c(base="red",mnorm="orange",meta="blue") # #col <- c(wrap_separate="red",sparselink="blue") # for(j in names(col)){ # val <- tapply(X=frac[j,cond],INDEX=setting$size[cond],FUN=mean) # graphics::points(x=setting$size[cond],y=frac[j,cond],col=col[j]) # graphics::lines(x=size,y=val,col=col[j],type="o",pch=16) # } # } # # # JOINET is better than standard lasso when n>=100, performance is similar to spls and better than glmnet-mgaussian # # #--- --- --- --- --- --- # # binarisation --- # #--- --- --- --- --- --- # # # yy <- 1*cbind(y[,"CCP"]>20,y[,"CRP"]>10,y[,"DAS28"]>5) # metric <- list() # for(i in 1:10){ # metric[[i]] <- joinet:::cv.joinet(Y=yy,X=x,family="binomial") # } # # #rowMeans(sapply(metric,function(x) rowMeans(x))) # # #--- --- --- --- --- --- --- # #--- explore other datasets --- # #--- --- --- --- --- --- --- # # #projects <- recount3::available_projects(organism="human") # #recount3::read_metadata() # # # #data <- xlsx::read.xlsx(paste0(path,"/results/PPMI_Curated_Data_Cut_Public_20250321.xlsx"),sheetIndex=1) # data <- read.csv(paste0(path,"/results/PPMI_Curated_Data_Cut_Public_20250321.csv")) # # # baseline data (predictors) # x <- data[data$COHORT==1 & data$EVENT_ID=="BL",] # rownames(x) <- x$PATNO # prop.miss <- colMeans(is.na(x)) # x <- x[,prop.miss<=0.5 & sapply(x,class) %in% c("numeric","integer")] # x <- x[,which(colnames(x)=="age"):ncol(x)] # x <- missRanger::missRanger(data=x,pmm.k=3,num.trees=100,verbose=0,seed=1) # x <- scale(x) # # # follow-up data (outcomes) # visit <- c("V04","V06","V08") # y <- data[data$COHORT==1 & data$EVENT_ID %in% visit,] # colnames(y)[colnames(y)=="updrs_totscore"] <- "updrs" # vars <- c("moca","quip","updrs","gds","scopa","ess","bjlot","rem") # y <- y[,c("EVENT_ID","PATNO",vars)] # y <- reshape(data=y,idvar="PATNO",timevar="EVENT_ID",direction="wide") # rownames(y) <- y$PATNO; y$PATNO <- NULL # # # overlap # names <- intersect(rownames(x),rownames(y)) # x <- x[names,] # y <- y[names,] # y <- as.matrix(y) # #y <- scale(y) # # metric <- list() # for(i in seq_along(vars)){ # cat(vars[i],"\n") # cols <- paste0(vars[i],".",visit) # cond <- rowSums(is.na(y[,cols]))==0 # temporary # if(sum(cond)>250){cond[cumsum(cond)>250] <- FALSE} # temporary # metric[[i]] <- joinet:::cv.joinet(Y=y[cond,cols],X=x[cond,],family="gaussian",compare="mnorm") # #method <- c("wrap_empty","wrap_separate","wrap_mgaussian","sparselink","devel") # ,"sparselink") # #metric[[i]] <- cv_multiple(y=y[cond,cols],X=x[cond,],family="gaussian",method=method,nfolds=10,alpha.init=0.95,alpha=1,type="exp",trial=FALSE) # } # # #rowMeans(sapply(metric,function(x) colMeans(t(x)/x["none",]))) # rowMeans(sapply(metric,function(x) colMeans(x$deviance/x$deviance[,"wrap_empty"]))) # ## ----session_info,echo=FALSE,eval=fig.tab------------------------------------- # writeLines(text=capture.output(utils::sessionInfo(),cat("\n"), sessioninfo::session_info()),con=paste0(path,"/results/info_figtab.txt"))