############################################################## # ========================================================= # An Evo-Devo Analysis in R: an example with cone snails # ========================================================= # by Nick Matzke (and whoever else adds to this PhyloWiki page) # Copyright 2011-infinity # matzkeATberkeley.edu # March 2011 # # Please link/cite if you use this, email me if you have # thoughts/improvements/corrections. # ############################################################## # # Copyright: this lab contains/uses unpublished research data & # analyses. # # IB200b students are of course free to re-use the code, others should # contact me about re-use of the code or data. # # This work was done in collaboration with Monty Slatkin, George # Oster, and Neil Gong. # ################################################################### # # Assignment for Tuesday, March 8 # # In this lab, I will take you through a real analysis I am # working on that is basically applying ancestral character # reconstruction methods and model selection to an evo-devo # question. # # George Oster's lab (Oster is a biophysicist) developed a shell- # growth model for cone snails that runs in Matlab. (Google cone # snails to look at some shells.) # # Basically, given a series of 24 input parameters, the model grows # a shell, complete with the pattern, which might be stripes, dots, # triangles, blank, etc. The first 19 parameters control various # signal and response curves that cells of the cone snail mantle, # under the model issue and receive, which then control the release # of pigment. # # The last 5 parameters deal with shell shape: # # kz, kr, and turn are from Raup's famous shell-shape model, and # control the # of turns, the angle, etc. of the shell # # N and T are starting conditions for the model (# of cells at the # growing edge, and the time point they start at). # # # # Once they built the model, Oster's lab then took a bunch of living # species, and figured out the parameters that grow patterns to match # each of these species. # # Oster then came over to Integrative Biology to "do some evolution # with this". Some obvious questions are: # # 1. Is there any phylogenetic signal in these parameters, which # after all are not direct measurements, but parameters that were # fit to a computer model by eye. # # 1a. Are the parameters statistically independent, or highly # correlated with each other? # # 2. Is Brownian motion a reasonable model under which to # conduct ancestral state reconstruction? # # 3. What model of evolution best describes each parameter? # Available models are: # - Brownian motion (BM); # # - the Ornstein−Uhlenbeck (OU) stabilizing selection model; # # - the lambda model which rescales internal branch lengths by # a linear fraction; # # - the kappa model (green) which rescales each branch length by a power # equal to the kappa parameter, and which becomes a speciational model # as kappa approaches 0; # # - delta (yellow) which focuses change towards the base or tips; # # - early burst (EB, cyan) which has an initial high rate of change that # then declines; # # - and white noise (white), where observations are produced by a normal # distribution with no tree structure, which represents the situation # of no phylogenetic signal. # # See references under the function fitContinuous. # # Assignment: # # Work through this code, and try to understand what each # stage does. # # Have a look at the final Likelihood Ratio Test results, and the model # comparison figure which uses AIC, and email me your verbal summary of # those results. E.g., which model is best, how confidence should the # reader be in those conclusions, and what are the implications in # terms of understanding the evolutionary processes involved, # and the likely accuracy of using Brownian motion for ancestral # character reconstruction. # # Files to download: # # Start by looking at the graphical file Neil_tree.png on the website. # This has photos of the cone snail species, and the matching shells # grown in the computer, on a DNA phylogeny. # # The parameters are stored in parameters_njm7.txt, which this script # takes as input, along with the DNA tree, tr.newick, and the # ultrametricized version, tr2.newick. # # Just to cut to the end, The final results figure is at: # tree_DNA_num_ML_inv2.png # ################################################################### # Stuff that is handy at the beginning of a script ###################################################### # R, close all devices/windows/ quartz windows graphics.off() # Turn off all BS stringsAsFactors silliness # R tends to read options(stringsAsFactors = FALSE) ###################################################### # Working directories: # One of the first things you want to do, usually, is # decide on your working directory # On my Mac, this is the directory wd = "/Users/nick/Desktop/_ib200a/ib200b_sp2011/lab10" # On a PC, you might have to specify paths like this: #wd = "c:\\Users\\nick\\Desktop\\_ib200a\\ib200b_sp2011\\lab10" #desc: set working directory setwd(wd) #desc: get working directory getwd() #desc: list the files in the directory list.files() # Sourcing scripts # Local sourcing: # source a script (windows machines require \\ instead of /) # sourcedir = '/_njm/' #source3 = '_genericR_v1.R' #source(paste(sourcedir, source3, sep="")) # You need the APE package to source genericR_v1.R scripts install.packages("ape") library(ape) # Remote sourcing: source("http://ib.berkeley.edu/courses/ib200b/scripts/_genericR_v1.R") source("http://ib.berkeley.edu/courses/ib200b/scripts/_conus_analyses_v1.R") source("http://ib.berkeley.edu/courses/ib200b/scripts/_R_tree_functions_v1.R") # Setup: # install these, if needed library(ape) library(phangorn) # helps with "midpoint" function library(picante) # for "Kcalc" and "phylosignal", i.e. calculation of the kappa statistic for continuous parameters library(geiger) # for e.g. fitContinuous ######################################### # Plot the original DNA tree ######################################### trfn = "tr.newick" tr = read.tree(trfn) plot(tr) title("Original DNA tree (digitized from Nam et al. 2009\nusing TreeRogue 0.1)\n(Note: axis scale is in digitization coordinates)") axisPhylo() add.scale.bar(x=0, y=23, length=1) # Get the parameters file params_orig = read_table_good("parameters_njm7.txt") # identify the types of params uniform1_params_names = c("s1g", "s2eg", "s2ig", "s3g", "sf2ia", "i1s1g", "i1s2eg", "i1s2ig", "i1s3g", "i1sf2ia", "i2s1g", "i2s2eg", "i2s2ig", "i2s3g", "i2sf2ia") continuous_params_names = c("s1n", "s1t", "s2en", "s2et", "s2in", "s2it", "s3n", "s3t", "sf2ea", "sf2ed", "sf2id", "tf2eb1", "tf2ec1", "tf2eb2", "tf2ec2", "tf2ib1", "tf2ic1", "tf2ib2", "tf2ic2") shell_params_names = c("kz", "kr", "turn") starting_params_names = c("N", "T") complex_params_names = c("s3n_std", "s3t_spd") clad_chars_names = c("stripes", "triangles", "dots", "color", "conical", "food") complex1_names = names(params_orig)[(ncol(params_orig)-25*2) : (ncol(params_orig)-26)] complex2_names = names(params_orig)[(ncol(params_orig)-25) : ncol(params_orig)] nonsucky_names = c(continuous_params_names, shell_params_names, starting_params_names, complex_params_names, clad_chars_names) all_cont_names = c(continuous_params_names, shell_params_names, starting_params_names) # remove the uniform params cols_tokeep = names(params_orig) %in% uniform1_params_names == FALSE all_params = params_orig[, cols_tokeep] row.names(all_params) = all_params$species (all_params) ################################### # Drop the tips we're not using ################################### all_params_orig = all_params #n1 = xy3$nodenames[xy3$nodenames != ""] n1 = tr$tip.label n2 = all_params$species n1_to_keep = n1 %in% n2 print(cbind(n1, n1_to_keep)) # drop the tips we're not using tr2 = drop.tip(tr, n1[n1_to_keep == FALSE]) #################################################################### # reorder the parameters to match the order of tips in the tree # (this is CRUCIAL!!) #################################################################### # tree tip order, from bottom to top: (labels1 = tr2$tip.label) # order of rows in char_dtf (labels2 = row.names(all_params)) # reorder char_dtf to match tip.labels new_order = match(labels1, labels2) all_params = all_params_orig[new_order, ] params = all_params # Cut params down to just continuous parameters for correlation analysis params = params[, all_cont_names] #################################### #pairs(params[, -1]) cps = cor(params) cps = as.data.frame(cps) names_cps = names(cps) #dim(params[, -1]) # Rank pairwise correlations pairwise_cors_list = c() for (i in 2:nrow(cps)) { if ((i+1) < ncol(cps)) { for (j in (i+1):ncol(cps)) { tmprow = c(names_cps[i], names_cps[j], cps[i,j], abs(cps[i, j])) pairwise_cors_list = rbind(pairwise_cors_list, tmprow) } } } row.names(pairwise_cors_list) = NULL pairwise_cors_list = as.data.frame(pairwise_cors_list) names(pairwise_cors_list) = c("x", "y", "cor", "abs_cor") #rank(pairwise_cors_list$abs_cor) order_cors = order(pairwise_cors_list$abs_cor, decreasing=TRUE) pairwise_cors_list = pairwise_cors_list[order_cors, ] print("Top correlations:") (pairwise_cors_list[1:20, ]) ###################################################### # Phylogenetically Independent Contrasts ###################################################### # we are going to use an ultrametric tree: chtr2 = read.tree("chtr2.newick") #chronogram(tr2, scale = 1, expo = 2, minEdgeLength = 1e-06) write.tree(chtr2, file="chtr2.newick") plot(chtr2) title("DNA tree, ultrametricized with APE's chronogram() function: NPRS\n(nonparametric rate smoothing), basically a least squares fit after Sanderson 1997") axisPhylo() add.scale.bar(x=0, y=0.9, length=0.1) pic_params = c() for (i in 1:ncol(params)) { x = pic(params[,i], chtr2) pic_params = cbind(pic_params, x) } pic_params = as.data.frame(pic_params) names(pic_params) = names(params) #pairs(pic_params) cor(pic_params) # Plot all params vs. all params tmpparams = params[, all_cont_names] tmpparams = dfnums_to_numeric(tmpparams) tmpparams[tmpparams == "?"] = NA pairs.panels(tmpparams, smooth=FALSE) title("Regular correlations") pairs.panels(pic_params, smooth=FALSE) title("PIC correlations") # Assemble the tables of correlations for params and PICs of params res1_txt_table = c() respic_txt_table = c() for (i in 1:ncol(params)) { for (j in i:ncol(params)) { if (i == j) { next() } cat("x=", names(params)[i], " vs. y=", names(params)[j], "\n", sep="") res1 = cor.test(params[, i], params[, j]) respic = cor.test(pic_params[, i], pic_params[, j]) #print() #print() lm_res1 = lm(params[, j] ~ params[, i]) lm_respic = lm(pic_params[, j] ~ pic_params[, i]) res1_txt = c(names(params)[i], names(params)[j], res1$statistic, res1$parameter, res1$p.value, res1$estimate, res1$null.value, res1$alternative, res1$method, res1$data.name, res1$conf.int[1], res1$conf.int[2], lm_res1$coefficients[2], lm_res1$coefficients[1]) respic_txt = c(names(params)[i], names(params)[j], respic$statistic, respic$parameter, respic$p.value, respic$estimate, respic$null.value, respic$alternative, respic$method, respic$data.name, respic$conf.int[1], respic$conf.int[2], lm_respic$coefficients[2], lm_respic$coefficients[1]) res1_txt_table = rbind(res1_txt_table, res1_txt) respic_txt_table = rbind(respic_txt_table, respic_txt) } } res1_txt_table = as.data.frame(res1_txt_table, row.names=1:nrow(res1_txt_table)) names(res1_txt_table) = (c("x", "y", "statistic_t", "parameter_df", "pval", "cor_estimate", "null", "alternative", "method", "data.name", "conf.int1", "conf.int2", "slope", "intercept")) respic_txt_table = as.data.frame(respic_txt_table, row.names=1:nrow(respic_txt_table)) names(respic_txt_table) = (c("x", "y", "statistic_t", "parameter_df", "pval", "cor_estimate", "null", "alternative", "method", "data.name", "conf.int1", "conf.int2", "slope", "intercept")) res1_txt_table = dfnums_to_numeric(res1_txt_table) respic_txt_table = dfnums_to_numeric(respic_txt_table) res1_txt_table$pval_bonf = res1_txt_table$pval * nrow(res1_txt_table) respic_txt_table$pval_bonf = respic_txt_table$pval * nrow(respic_txt_table) res1_txt_table = res1_txt_table[order(respic_txt_table$pval_bonf),] respic_txt_table = respic_txt_table[order(respic_txt_table$pval_bonf),] (res1_txt_table[1:20,]) (respic_txt_table[1:20,]) # sort by ascending p-value res1_txt_table_sort = res1_txt_table[order(res1_txt_table$pval), ] respic_txt_table_sort = respic_txt_table[order(respic_txt_table$pval), ] (res1_txt_table_sort[1:20,]) (respic_txt_table_sort[1:20,]) x = res1_txt_table$cor_estimate y = respic_txt_table$cor_estimate xlab = "r (regular)" ylab="r (PIC)" xlim = c(-1, 1) ylim = c(-1, 1) titlestart_txt = "correlation (r) estimates for pairwise correlations,\nnormal vs. phylogenetically independent contrasts;\n" basic_xy_plot(x, y, titlestart_txt, xlab, ylab, xlim, ylim) x = res1_txt_table$pval y = respic_txt_table$pval xlab="p-val (regular)" ylab="p-val (PIC)" xlim = c(0, 1) ylim = c(0, 1) titlestart_txt = "p-values for pairwise correlations,\nnormal vs. phylogenetically independent contrasts\n" basic_xy_plot(x, y, titlestart_txt, xlab, ylab, xlim, ylim) # Write the tables to text write.table(res1_txt_table, "res1_txt_table.txt") write.table(respic_txt_table, "respic_txt_table.txt") ############################################################ ##################################################### # Assess the parsimony stats of the discrete characters ##################################################### colnames_to_use = c(complex_params_names, clad_chars_names) char_dtf_orig = all_params[, colnames_to_use] char_dtf = char_dtf_orig # Make the final figure comparing the trees # quite fast #(parstats = parsim_stats_fast(tr2, dfnums_to_numeric(char_dtf))) (parstats = parsim_stats_fast(tr2, dfnums_to_numeric(char_dtf))) write_table_good(parstats, "parstats_v8.txt") # slower #(parstats = parsim_stats(tr2, char_dtf)) # Now, resample (takes a few seconds) num_resamps = 100 parstats_null = array(data=NA, dim=c(ncol(char_dtf)+1, 6, num_resamps+1)) for (i in 1:num_resamps) { tmpsamp = resample_dtf_matrix(char_dtf) parstats_null[, , i] = as.matrix(parsim_stats_fast(tr2, tmpsamp)) } parstats_null[, , num_resamps+1] = as.matrix(parstats) summary_parstats_means = parstats summary_parstats_sds = parstats summary_parstats_pvals = parstats # do histograms of the six different distances # set outer margins & number of subplots par(oma=c(1,1,5,1), mfrow=c(2,3)) for (i in 1:nrow(parstats)) { for (j in 1:ncol(parstats)) { x = parstats_null[i, j, ] summary_parstats_means[i, j] = mean(x) summary_parstats_sds[i, j] = sd(x) observed_val = parstats_null[i, j, num_resamps+1] pval = round(empirical_pval_fast(x, observed_val), 4) # redo pval depending on whether value should be low or high if (j > 3) { pval = round(1-pval, 4) } summary_parstats_pvals[i, j] = pval tmp_title = paste("non-parametric p-value = ", pval, sep="") h = hist(x, main=tmp_title, breaks=50, cex.main=1, xlab=names(parstats)[j]) # add the arrow arrowtop = 0.3*max(h$counts, na.rm=TRUE) arrowbot = 0.03*max(h$counts, na.rm=TRUE) arrows(observed_val, arrowtop, observed_val, arrowbot, col="blue", lwd=4) } titletxt = paste("Parsimony stats for ", row.names(parstats)[i], sep="") mtext(titletxt, outer=TRUE) } write_table_good(summary_parstats_means, "parstats_means_v8.txt") write_table_good(summary_parstats_sds, "parstats_sds_v8.txt") write_table_good(summary_parstats_pvals, "parstats_pvals_v8.txt") ####################################################### # Ancestral character reconstruction -- continuous, plots ####################################################### par(mfrow=c(1,2)) # character mapping # uniform1_params_names = c("s1g", "s2eg", "s2ig", "s3g", "sf2ia") # continuous_params_names = c("s1n", "s1t", "s2en", "s2et", "s2in", "s2it", "s3n", "s3t", "sf2ea", "sf2ed", "sf2id", "tf2eb1", "tf2ec1", "tf2eb2", "tf2ec2", "tf2ib1", "tf2ic1", "tf2ib2", "tf2ic2") # shell_params_names = c("kz", "kr", "turn") # starting_params_names = c("N", "T") # complex_params_names = c("s3n_std", "s3t_spd") # clad_chars_names # continuous parameter characters colnames_to_use = c(continuous_params_names, shell_params_names, starting_params_names) char_dtf_orig = all_params[, colnames_to_use] char_dtf = char_dtf_orig ################# # reorder char_dtf ################ # tree tip order, from bottom to top: #(labels1 = chtr2$tip.label) # order of rows in char_dtf #(labels2 = row.names(char_dtf)) # reorder char_dtf to match tip.labels #new_order = match(labels1, labels2) #char_dtf = char_dtf[new_order, ] par(mfrow=c(6,4)) for (colnum in 1:ncol(char_dtf)) { hist(as.numeric(char_dtf[,colnum]), 17, main=title(names(char_dtf)[colnum]) ) } par(mfrow=c(1,1)) ########################################################################### # Compare different models for the evolution of this character ########################################################################### runthis4 = TRUE if (runthis4 == TRUE) { # models_list = c("BM", "OU", "lambda", "kappa", "delta", "EB", "white", "trend") # cut "trend" model, doesn't matter for ultrametric trees (same as BM model, evidently) models_list = c("BM", "OU", "lambda", "kappa", "delta", "EB", "white") model_fits = NULL # GET THE VARIOUS PARAMETERS ALSO # beta (typically the rate) # alpha (central tendency on BM walk that is proportional to alpha) # Lambda model # multiplies all internal branches of the tree by lambda, leaving tip # branches as their original length # # kappa raises all branch lengths to the power kappa. As kappa approaches # zero, the model becomes speciational. # # (apparently, kappa is saved as "lamda" in the kappa model) # delta (delta>1, evo tipwards, < 1, evo at base) # EB (a = proportion of initial rate represented by the end rate (?)) # White Noise (draw from mean/variance) # mean = mean # nv = variance (?) # THESE BOUNDS PRODUCE PATHOLOGICAL RESULTS (on my first dataset) bounds_list = NULL # bounds_list$BM = list(beta=c(1,100)) # bounds_list$OU = list(alpha=c(0.0001, 10), beta=c(1, 100)) # bounds_list$lambda = list(beta=c(1,100)) # bounds_list$kappa = list(beta=c(1,100)) # bounds_list$EB = list(beta=c(1,100)) # bounds_list$white = NULL # THESE BOUNDS ARE THE DEFAULTS bounds_list = NULL # db = default bounds #db_beta = c(1e-08, 20) #db_lambda = c(1e-07, 1) #db_kappa = c(1e-06, 1) #db_delta = c(1e-05, 2.999999) #db_alpha = c(1e-07, 50) #db_a = c(-3, 0) #db_nv = c(1e-10, 100) #db_mu = c(-100, 100) # YOU SHOULD ALWAYS CHECK IF YOUR ESTIMATES HIT THE BOUNDS, AND # THEN RE-ADJUST bounds_list = NULL # db = default bounds db_beta = c(1e-08, 1e08) db_alpha = c(1e-07, 50) db_lambda = c(1e-07, 1) db_kappa = c(1e-06, 1) db_delta = c(1e-05, 2.9999) db_a = c(-3, 0) db_nv = c(1e-10, 1000000) #db_mu = c(-100, 100) bounds_list$BM = list(beta=db_beta) bounds_list$OU = list(beta=db_beta, alpha=db_alpha) bounds_list$lambda = list(beta=db_beta, lambda=db_lambda) bounds_list$kappa = list(beta=db_beta, lambda=db_kappa) bounds_list$EB = list(beta=db_beta, a=db_a) bounds_list$white = list(nv=db_nv) # bounds_list$model = list(mu=db_mu) for (i in 1:length(models_list)) { # Chose the right bounds cmdstr = paste("tmp_bounds = bounds_list$", models_list[i], sep="") eval(parse(text = cmdstr)) model_fit = fitContinuous(chtr2, char_dtf, model=models_list[i], bounds = tmp_bounds) cmdstr = paste("model_fits$", models_list[i], " = model_fit", sep="") eval(parse(text = cmdstr)) } # Save R object to a file save(model_fits, file="model_fits.Rdata") } else { load(file="model_fits.Rdata") } model_fits_orig = model_fits model_fits = model_fits_orig # Get log likelihoods and AICs results_list = c("lnl", "aic", "aicc", "k", "beta", "alpha", "lambda", "delta", "a", "mean", "nv") # fill in blank results for (item in results_list) { cmdstr = paste(item, "_matrix = matrix(NA, nrow=length(models_list), ncol=length(all_cont_names))", sep="") eval(parse(text = cmdstr)) } # put in the data from the model results for (i in 1:length(models_list)) { for (j in 1:length(all_cont_names)) { for (item in results_list) { cmdstr = paste("tmp_item = model_fits$", models_list[i], "$", all_cont_names[j], "$", item, sep="") eval(parse(text = cmdstr)) if (is.null(tmp_item)) { cmdstr = paste(item, "_matrix[i,j] = NA", sep="") eval(parse(text = cmdstr)) } else { cmdstr = paste(item, "_matrix[i,j] = tmp_item", sep="") eval(parse(text = cmdstr)) } } } } # Apply the row & column names for (item in results_list) { cmdstr = paste(item, "_matrix = adf(", item, "_matrix)", sep="") eval(parse(text = cmdstr)) cmdstr = paste("names(", item, "_matrix) = names(char_dtf)", sep="") eval(parse(text = cmdstr)) cmdstr = paste("row.names(", item, "_matrix) = models_list", sep="") eval(parse(text = cmdstr)) # write to txt file cmdstr = paste("write_table_good(", item, "_matrix, '", item, "_matrix.txt')", sep="") eval(parse(text = cmdstr)) } lnl_matrix aic_matrix aicc_matrix k_matrix beta_matrix alpha_matrix lambda_matrix delta_matrix a_matrix mean_matrix nv_matrix # Compare the models using AIC # Another way: use AIC # For each col, subtract the min diff_aic_matrix = aic_matrix for (colnum in 1:ncol(aic_matrix)) { diff_aic_matrix[, colnum] = aic_matrix[, colnum] - min(aic_matrix[, colnum], na.rm=TRUE) } cat("Table of difference-from-best-AIC values; 0 = best model\n") print(diff_aic_matrix, digits=2) # Citation for this calculation: # http://www.brianomeara.info/tutorials/aic # # or: # # aic_wi = exp(1) ^ (-0.5 * diff_aic_matrix) colsum_aic_wi = apply(aic_wi, 2, sum, na.rm=TRUE) # calculate the relative weights aic_weights = aic_wi for (colnum in 1:ncol(aic_wi)) { aic_weights[,colnum] = aic_wi[,colnum] / colsum_aic_wi[colnum] } # cat("These are the AIC weights, calculated according to Burnham & Anderson (2002).\n") print(aic_weights, digits=2) # flip aic_weights for graphing aic_weights[nrow(aic_weights):1, ] = aic_weights row.names(aic_weights) = row.names(aic_weights)[nrow(aic_weights):1] ##################################### # Plot AIC weights, with labels ##################################### # What are the best models under AIC best_model_list = rownames(aic_weights) best_models = rep(NA, length(colnames(aic_weights))) for (colnum in 1:length(colnames(aic_weights))) { TF_row = aic_weights[,colnum] == max(aic_weights[,colnum]) best_models[colnum] = best_model_list[TF_row] } print(best_models) # Get results #pdf(file = "conus_cors_all_vs_all_v3.pdf", width=15, height=15) doPDFs2 = TRUE if (doPDFs2 == TRUE) { pdffn = "Figure_SX_model_compare_v8.pdf" pdf(file=pdffn, width=8, height=10, paper="USr") } par(oma=c(1,1,1,1), mar=c(15.6, 4.1, 4.1, 7.1), mfrow = c(1,1)) par(xpd=TRUE) tmp_colors = c("white", "cyan", "yellow", "green2", "orange", "red", "brown") tmp_xnames = rep("", length(colnames(aic_weights))) legend_text = rev(c("Brownian", "O-U", "lambda", "kappa", "delta", "early burst", "white noise")) tmp_legend = list(x=38, y=0.9, pt.cex=4) # don't plot y-axis here par(yaxt="n") bar_x_positions = barplot(aic_weights, names.arg=tmp_xnames, col=tmp_colors, ylab="Proportion of AIC weight", xlab="shell growth parameter", legend.text=legend_text, args.legend=tmp_legend) # plot y-axis here par(yaxt="s") ticks = c(0, 0.25, 0.5, 0.75, 1) axis(side=2, at=ticks, line=0, labels=ticks, lty = "solid", lwd = 1) ## Create plot and get bar midpoints in 'mp' #bar_x_positions = barplot(1:length(colnames(aic_weights))) ## Set up x axis with tick marks alone #axis(1, at = bar_x_positions, labels = FALSE) # Create some text labels tmp_x_labels = colnames(aic_weights) # Plot x axis labels at mp text(x=bar_x_positions-0.1, y=-0.015, adj=0, srt=315, labels = tmp_x_labels, xpd=TRUE) # Plot the best models across the top text(x=bar_x_positions-0.1, y=1.015, adj=0, srt=45, labels = best_models, xpd=TRUE) toptxt = "(Best model under AIC)" text(x=15, y=1.14, labels=toptxt) caption = "Figure SX: Akaike weights of 7 models for the evolution of shell parameters. The models \ncompared are: Brownian motion (BM, brown); the Ornstein-Uhlenbeck (OU, red) \nstabilizing selection model; the lambda model (orange) which rescales internal branch \nlengths by a linear fraction; the kappa model (green) which rescales each branch length \nby a power equal to the kappa parameter, and which becomes a speciational model as \nkappa approaches 0; delta (yellow) which focuses change towards the base or tips; \nearly burst (EB, cyan) which has an initial high rate of change that then declines; and white \nnoise (white), where observations are produced by a normal distribution with no tree \nstructure, which represents the situation of no phylogenetic signal. Brownian motion (BM) \nhas the highest AIC weight for 63% of the shell parameters (15/24). White noise \n(no phylogenetic signal) is superior for 25% (6/24) shell parameters." text(x=-5.4, y=-0.56, labels=caption, adj=0, xpd=TRUE) ########################################################## # Turn off the PDF and open if (doPDFs2 == TRUE) { dev.off() cmdstr = paste("open ", pdffn, sep="") system(cmdstr) } ########################################################## # Do LRT on the different models ########################################################## # Compare all models to BM pchisq_table = matrix(NA, nrow=5, ncol=ncol(lnl_matrix)) for (colnum in 1:ncol(lnl_matrix)) { tmp_lnls = lnl_matrix[, colnum] # Calculate the log-likelihood differences from # the Brownian motion model # # You can't compare white noise, since it's not nested tmp_lnls_diffs = tmp_lnls[2:(length(tmp_lnls)-1)] - tmp_lnls[1] # take twice the difference tmp_lnls_twice_diffs = 2 * tmp_lnls_diffs dim(tmp_lnls_twice_diffs) = c(1, length(tmp_lnls_twice_diffs)) # Do chi-squared test with 1 degree of freedom tmp_pchisq = apply(tmp_lnls_twice_diffs, 1, pchisq, 1, lower.tail=FALSE) pchisq_table[,colnum] = tmp_pchisq #print(tmp_pchisq) } pchisq_table = adf(pchisq_table) names(pchisq_table) = names(lnl_matrix) row.names(pchisq_table) = row.names(lnl_matrix)[2:(nrow(lnl_matrix)-1)] print(pchisq_table, digits=2) write_table_good(pchisq_table, "pchisq_table.txt") # 11 / 120 times there is a model significantly better than BM pchisq_table < 0.05 sum(pchisq_table < 0.05) # 5/ 24 params reject BM significantly: # sf2id, tf2ib1, tf2ib2, kr, turn BM_rejected = apply(pchisq_table < 0.05, 2, sum) # All false with Bonferroni correction, but that's probably not fair pchisq_table < (0.05 / (nrow(pchisq_table) * ncol(pchisq_table)) ) ########################################################## #x = dtf_to_phyDat(char_dtf) txt_suffix = "" # ACE methods: # a character specifying the method used for estimation. Three choices are possible: "ML", "pic", or "GLS". # ACE with ML method # BM = brownian motion num_internal_nodes = chtr2$Nnode PICcont_means = array(NA, dim=c(num_internal_nodes, ncol(char_dtf)) ) PICcont_lower025 = array(NA, dim=c(num_internal_nodes, ncol(char_dtf)) ) PICcont_upper975 = array(NA, dim=c(num_internal_nodes, ncol(char_dtf)) ) for (i in 1:ncol(char_dtf)) { PICcont = ace(char_dtf[,i], chtr2, type="continuous", method="pic", CI=TRUE, model="BM") PICcont_means[, i] = PICcont$ace PICcont_lower025[, i] = PICcont$CI95[, 1] PICcont_upper975[, i] = PICcont$CI95[, 2] } PICcont_means = adf(PICcont_means) names(PICcont_means) = names(char_dtf) tree_dtf = prt(chtr2) row.names(PICcont_means) = tree_dtf$node[ (length(chtr2$tip.label)+1): length(tree_dtf$node)] PICcont_lower025 = adf(PICcont_lower025) names(PICcont_lower025) = names(char_dtf) row.names(PICcont_lower025) = row.names(PICcont_means) PICcont_upper975 = adf(PICcont_upper975) names(PICcont_upper975) = names(char_dtf) row.names(PICcont_upper975) = row.names(PICcont_means) write_table_good_w_rownames(PICcont_means, "PICcont_means_v8.txt") write_table_good_w_rownames(PICcont_lower025, "PICcont_lower025_v8.txt") write_table_good_w_rownames(PICcont_upper975, "PICcont_upper975_v8.txt") # plot tip & node values to check them # plot tip & node values to check them txt1 = "PIC" txt2 = txt_suffix for (i in 1:ncol(char_dtf)) { recon_txt = paste("ACE technique: ", txt1, txt2, sep="") tmptipvals = char_dtf[,i] tmpnodevals = PICcont_means[,i] maxval = max(c(tmptipvals, tmpnodevals)) texttxt = paste(recon_txt, "\nParameter: ", names(char_dtf)[i], "\nMaximum value = ", maxval, sep="") plot_cont_vals_on_tree(chtr2, tmptipvals, tmpnodevals, texttxt) } MLcont_means = array(NA, dim=c(num_internal_nodes, ncol(char_dtf)) ) MLcont_lower025 = array(NA, dim=c(num_internal_nodes, ncol(char_dtf)) ) MLcont_upper975 = array(NA, dim=c(num_internal_nodes, ncol(char_dtf)) ) Blombergs_Kstats = c() PIC.variance.obs = c() PIC.variance.rnd.mean = c() PIC.variance.P = c() PIC.variance.Z = c() for (i in 1:ncol(char_dtf)) { MLcont = ace(char_dtf[,i], chtr2, type="continuous", method="ML", CI=TRUE, model="BM") phylosignals = phylosignal(char_dtf[,i], chtr2) Blombergs_Kstats = c(Blombergs_Kstats, phylosignals$K) PIC.variance.obs = c(PIC.variance.obs, phylosignals$PIC.variance.obs) PIC.variance.rnd.mean = c(PIC.variance.rnd.mean, phylosignals$PIC.variance.rnd.mean) PIC.variance.P = c(PIC.variance.P, phylosignals$PIC.variance.P) PIC.variance.Z = c(PIC.variance.Z, phylosignals$PIC.variance.Z) MLcont_means[, i] = MLcont$ace MLcont_lower025[, i] = MLcont$CI95[, 1] MLcont_upper975[, i] = MLcont$CI95[, 2] } MLcont_means = adf(MLcont_means) names(MLcont_means) = names(char_dtf) tree_dtf = prt(chtr2) row.names(MLcont_means) = tree_dtf$node[ (length(chtr2$tip.label)+1): length(tree_dtf$node)] MLcont_lower025 = adf(MLcont_lower025) names(MLcont_lower025) = names(char_dtf) row.names(MLcont_lower025) = row.names(MLcont_means) MLcont_upper975 = adf(MLcont_upper975) names(MLcont_upper975) = names(char_dtf) row.names(MLcont_upper975) = row.names(MLcont_means) # Bonferroni correction PIC.variance.P_bonferroni = PIC.variance.P * length(Blombergs_Kstats) phylosignal_table = cbind(Blombergs_Kstats, PIC.variance.obs, PIC.variance.rnd.mean, PIC.variance.Z, PIC.variance.P, PIC.variance.P_bonferroni) row.names(phylosignal_table) = names(char_dtf) (phylosignal_table) write_table_good_w_rownames(MLcont_means, "MLcont_means_v8.txt") write_table_good_w_rownames(MLcont_lower025, "MLcont_lower025_v8.txt") write_table_good_w_rownames(MLcont_upper975, "MLcont_upper975_v8.txt") write_table_good_w_rownames(phylosignal_table, "phylosignal_table_v8.txt") # plot tip & node values to check them txt1 = "ML" txt2 = txt_suffix for (i in 1:ncol(char_dtf)) { recon_txt = paste("ACE technique: ", txt1, txt2, sep="") tmptipvals = char_dtf[,i] tmpnodevals = PICcont_means[,i] maxval = max(c(tmptipvals, tmpnodevals)) texttxt = paste(recon_txt, "\nParameter: ", names(char_dtf)[i], "\nMaximum value = ", maxval, sep="") plot_cont_vals_on_tree(chtr2, tmptipvals, tmpnodevals, texttxt) } GLScont_means = array(NA, dim=c(num_internal_nodes, ncol(char_dtf)) ) GLScont_lower025 = array(NA, dim=c(num_internal_nodes, ncol(char_dtf)) ) GLScont_upper975 = array(NA, dim=c(num_internal_nodes, ncol(char_dtf)) ) for (i in 1:ncol(char_dtf)) { GLScont = ace(char_dtf[,i], chtr2, type="continuous", method="GLS", CI=TRUE, corStruct = corBrownian(1, chtr2)) GLScont_means[, i] = GLScont$ace GLScont_lower025[, i] = GLScont$CI95[, 1] GLScont_upper975[, i] = GLScont$CI95[, 2] } GLScont_means = adf(GLScont_means) names(GLScont_means) = names(char_dtf) tree_dtf = prt(chtr2) row.names(GLScont_means) = tree_dtf$node[ (length(chtr2$tip.label)+1): length(tree_dtf$node)] GLScont_lower025 = adf(GLScont_lower025) names(GLScont_lower025) = names(char_dtf) row.names(GLScont_lower025) = row.names(GLScont_means) GLScont_upper975 = adf(GLScont_upper975) names(GLScont_upper975) = names(char_dtf) row.names(GLScont_upper975) = row.names(GLScont_means) write_table_good_w_rownames(GLScont_means, "GLScont_means_v8.txt") write_table_good_w_rownames(GLScont_lower025, "GLScont_lower025_v8.txt") write_table_good_w_rownames(GLScont_upper975, "GLScont_upper975_v8.txt") # write the tree out to a table for Excel tmptree_table = prt(chtr2) tmptree_table$daughter_nds = as.character(tmptree_table$daughter_nds) #tmptree_table = tmptree_table[, !(names(tmptree_table) == "daughter_nds")] write_table_good(tmptree_table, "chtr2_v8.txt") # plot tip & node values to check them txt1 = "GLS" txt2 = txt_suffix for (i in 1:ncol(char_dtf)) { recon_txt = paste("ACE technique: ", txt1, txt2, sep="") tmptipvals = char_dtf[,i] tmpnodevals = PICcont_means[,i] maxval = max(c(tmptipvals, tmpnodevals)) texttxt = paste(recon_txt, "\nParameter: ", names(char_dtf)[i], "\nMaximum value = ", maxval, sep="") plot_cont_vals_on_tree(chtr2, tmptipvals, tmpnodevals, texttxt) } # Compare the estimates against each other with pairs plots for (i in 1:ncol(char_dtf)) { tmpcols = cbind(PICcont_means[,i], MLcont_means[,i], GLScont_means[,i]) tmpcols = adf(tmpcols) names(tmpcols) = c("ACE_pic", "ACE_ML", "ACE_GLS") pairs.panels(tmpcols) titletxt = paste("param #", i, ": ", names(char_dtf)[i], sep="") title(titletxt, line=-1.35, outer=TRUE) } ####################################################### # Reconstruct the ancestors -- discrete, plots # discrete parameter characters ####################################################### colnames_to_use = c(complex_params_names, clad_chars_names) char_dtf = all_params[, colnames_to_use] num_uniq = apply(char_dtf, 2, function(x) length(unique(x))) # parsimony reconstruction # # https://stat.ethz.ch/pipermail/r-sig-phylo/2010-January/000565.html # Hi Santiago, # # Look at the ace function in the Ape package - to get a max parsimony # reconstruction set branch lengths =1 and run a maximum likelihood # estimation. You can find further information here: http://www.r-phylo.org/wiki/HowTo/Ancestral_State_Reconstruction # #x = dtf_to_phyDat(char_dtf) # make a parsimony tree: pars_tr = chtr2 # set all branch lengths to 1 pars_tr$edge.length[!is.na(pars_tr$edge.length)] = 1 #ace(char_dtf[,1], pars_tr, type="discrete", method="ML", model="ER") par(mfrow=c(1,2)) ACE_pars_estimates = array(NA, dim=c(num_internal_nodes, ncol(char_dtf)) ) for (i in 1:ncol(char_dtf)) { (ACE_pars = ace(char_dtf[,i], pars_tr, type="discrete", method="ML", model="ER", ip=0.3)) # convert to parsimony maxpar_node_vals = ACE_pars$lik.anc maxpar_node_maxes = apply(maxpar_node_vals, 1, max) maxpar_node_vals = adf(maxpar_node_vals) maxpar_node_MP = array(NA, dim=c(num_internal_nodes, 1)) for (j in 1:nrow(maxpar_node_vals)) { tmpTFs = maxpar_node_vals[j,] == maxpar_node_maxes[j] names(tmpTFs) colnum = which(tmpTFs) if (length(colnum) > 1) { tmplist = c() for (k in 1:length(colnum)) { tmplist[k] = names(maxpar_node_vals)[colnum[k]] } tmpstr = paste(tmplist, collapse="&") maxpar_node_MP[j] = tmpstr } else { maxpar_node_MP[j] = names(maxpar_node_vals)[colnum] } } ACE_pars_estimates[, i] = maxpar_node_MP # plot results tree_ht = get_max_height_tree(chtr2) label_offset = 0.05*tree_ht plot(chtr2, show.node.label=FALSE, cex=0.9, no.margin = TRUE, x.lim=1.6*tree_ht, label.offset=label_offset) if (num_uniq[i] == 2) { colors = c("yellow", "blue") tmpcols = char_dtf[,i] tmpcols[tmpcols == 0] = colors[1] tmpcols[tmpcols == 1] = colors[2] } if (num_uniq[i] == 3) { colors = c("yellow", "blue", "red") tmpcols = char_dtf[,i] tmpcols[tmpcols == 0] = colors[1] tmpcols[tmpcols == 1] = colors[2] tmpcols[tmpcols == 2] = colors[3] } tiplabels(pch = 19, col = tmpcols, cex = 2) #adj = 0, legend(0, 3, legend=as.character(0:max(char_dtf[,i])), fill=colors) nodelabels(thermo=ACE_pars$lik.anc, cex=1, piecol=colors) titletxt = paste("ML, parsimony-like reconstruction of\n", names(char_dtf)[i], " character", sep="") title(titletxt, cex=0.5) } ACE_pars_estimates = adf(ACE_pars_estimates) names(ACE_pars_estimates) = names(char_dtf) pars_tr_dtf = prt(pars_tr) row.names(ACE_pars_estimates) = tree_dtf$node[ (length(pars_tr$tip.label)+1): length(pars_tr_dtf$node)] #row.names(ACE_pars_estimates) = row.names(char_dtf) (ACE_pars_estimates) write_table_good(ACE_pars_estimates, "ACE_pars_estimates_v8.txt") # Do ML reconstruction, with varying branch lengths (i.e. not branches = 1 like "parsimony") par(mfrow=c(1,2)) ACE_ML_estimates = array(NA, dim=c(num_internal_nodes, ncol(char_dtf)) ) for (i in 1:ncol(char_dtf)) { (ACE_ML = ace(char_dtf[,i], chtr2, type="discrete", method="ML", model="ER", ip=0.3)) # convert to the single best ML estimate (plurality vote) maxML_node_vals = ACE_ML$lik.anc maxML_node_maxes = apply(maxML_node_vals, 1, max) maxML_node_vals = adf(maxML_node_vals) maxML_node_best = array(NA, dim=c(num_internal_nodes, 1)) for (j in 1:nrow(maxML_node_vals)) { tmpTFs = round(maxML_node_vals[j,], 3) == round(maxML_node_maxes[j], 3) names(tmpTFs) colnum = which(tmpTFs) if (length(colnum) > 1) { tmplist = c() for (k in 1:length(colnum)) { tmplist[k] = names(maxML_node_vals)[colnum[k]] } tmpstr = paste(tmplist, collapse="&") maxML_node_best[j] = tmpstr } else { maxML_node_best[j] = names(maxML_node_vals)[colnum] } } #cbind(maxML_node_vals, maxML_node_maxes, maxML_node_best) ACE_ML_estimates[, i] = maxML_node_best # plot results tree_ht = get_max_height_tree(chtr2) label_offset = 0.05*tree_ht plot(chtr2, show.node.label=FALSE, cex=0.9, no.margin = TRUE, x.lim=1.6*tree_ht, label.offset=label_offset) if (num_uniq[i] == 2) { colors = c("yellow", "blue") tmpcols = char_dtf[,i] tmpcols[tmpcols == 0] = colors[1] tmpcols[tmpcols == 1] = colors[2] } if (num_uniq[i] == 3) { colors = c("yellow", "blue", "red") tmpcols = char_dtf[,i] tmpcols[tmpcols == 0] = colors[1] tmpcols[tmpcols == 1] = colors[2] tmpcols[tmpcols == 2] = colors[3] } tiplabels(pch = 19, col = tmpcols, cex = 2) #adj = 0, legend(0, 3, legend=as.character(0:max(char_dtf[,i])), fill=colors) nodelabels(thermo=ACE_ML$lik.anc, cex=1, piecol=colors) titletxt = paste("ML, equal rates reconstruction of\n", names(char_dtf)[i], " character", sep="") title(titletxt, cex=0.5) } ACE_ML_estimates = adf(ACE_ML_estimates) names(ACE_ML_estimates) = names(char_dtf) pars_tr_dtf = prt(pars_tr) row.names(ACE_ML_estimates) = tree_dtf$node[ (length(pars_tr$tip.label)+1): length(pars_tr_dtf$node)] #row.names(ACE_ML_estimates) = row.names(char_dtf) (ACE_ML_estimates) write_table_good(ACE_ML_estimates, "ACE_ML_estimates_v8.txt") # Write out the trees we generates, also... write.tree(tr, "tr.newick") write.tree(tr2, "tr2.newick") write.tree(chtr2, "chtr2.newick")