##############################################################
# =========================================================
# An Evo-Devo Analysis in R: an example with cone snails
# =========================================================
# by Nick Matzke (and whoever else adds to this PhyloWiki page)
# Copyright 2011-infinity
# matzkeATberkeley.edu
# March 2011
#
# Please link/cite if you use this, email me if you have 
#   thoughts/improvements/corrections.
#
##############################################################
#
# Copyright: this lab contains/uses unpublished research data & 
# analyses.
# 
# IB200b students are of course free to re-use the code, others should 
# contact me about re-use of the code or data.
#
# This work was done in collaboration with Monty Slatkin, George
# Oster, and Neil Gong.
# 
###################################################################
#
# Assignment for Tuesday, March 8
#
# In this lab, I will take you through a real analysis I am
# working on that is basically applying ancestral character
# reconstruction methods and model selection to an evo-devo
# question.
#
# George Oster's lab (Oster is a biophysicist) developed a shell-
# growth model for cone snails that runs in Matlab.  (Google cone
# snails to look at some shells.)
#
# Basically, given a series of 24 input parameters, the model grows
# a shell, complete with the pattern, which might be stripes, dots,
# triangles, blank, etc.  The first 19 parameters control various 
# signal and response curves that cells of the cone snail mantle,
# under the model issue and receive, which then control the release
# of pigment.
#
# The last 5 parameters deal with shell shape:
#
# kz, kr, and turn are from Raup's famous shell-shape model, and 
# control the # of turns, the angle, etc. of the shell
#
# N and T are starting conditions for the model (# of cells at the 
# growing edge, and the time point they start at).
#
#
#
# Once they built the model, Oster's lab then took a bunch of living
# species, and figured out the parameters that grow patterns to match
# each of these species.
# 
# Oster then came over to Integrative Biology to "do some evolution
# with this".  Some obvious questions are:
#
# 1. Is there any phylogenetic signal in these parameters, which 
# after all are not direct measurements, but parameters that were
# fit to a computer model by eye.
#
# 1a. Are the parameters statistically independent, or highly 
# correlated with each other?
#
# 2. Is Brownian motion a reasonable model under which to 
# conduct ancestral state reconstruction?
#
# 3. What model of evolution best describes each parameter? 
# Available models are:
# - Brownian motion (BM);
#
# - the Ornstein−Uhlenbeck (OU) stabilizing selection model;
#
# - the lambda model which rescales internal branch lengths by
#   a linear fraction;
#
# - the kappa model (green) which rescales each branch length by a power
#   equal to the kappa parameter, and which becomes a speciational model
#   as kappa approaches 0;
#
# - delta (yellow) which focuses change towards the base or tips;
#
# - early burst (EB, cyan) which has an initial high rate of change that
#   then declines;
#
# - and white noise (white), where observations are produced by a normal
#   distribution with no tree structure, which represents the situation
#   of no phylogenetic signal.
#
# See references under the function fitContinuous.
#
# Assignment: 
# 
# Work through this code, and try to understand what each 
# stage does.
#
# Have a look at the final Likelihood Ratio Test results, and the model
# comparison figure which uses AIC, and email me your verbal summary of
# those results.  E.g., which model is best, how confidence should the 
# reader be in those conclusions, and what are the implications in 
# terms of understanding the evolutionary processes involved,
# and the likely accuracy of using Brownian motion for ancestral
# character reconstruction.
#
# Files to download:
#
# Start by looking at the graphical file Neil_tree.png on the website.
# This has photos of the cone snail species, and the matching shells
# grown in the computer, on a DNA phylogeny.
# 
# The parameters are stored in parameters_njm7.txt, which this script
# takes as input, along with the DNA tree, tr.newick, and the 
# ultrametricized version, tr2.newick.
#
# Just to cut to the end, The final results figure is at:
# tree_DNA_num_ML_inv2.png
#
###################################################################


# Stuff that is handy at the beginning of a script
######################################################
# R, close all devices/windows/ quartz windows
graphics.off()

# Turn off all BS stringsAsFactors silliness
# R tends to read 
options(stringsAsFactors = FALSE)
######################################################


# Working directories:
# One of the first things you want to do, usually, is 
# decide on your working directory

# On my Mac, this is the directory
wd = "/Users/nick/Desktop/_ib200a/ib200b_sp2011/lab10"

# On a PC, you might have to specify paths like this:
#wd = "c:\\Users\\nick\\Desktop\\_ib200a\\ib200b_sp2011\\lab10"

#desc: set working directory
setwd(wd)

#desc: get working directory
getwd()

#desc: list the files in the directory
list.files()


# Sourcing scripts
# Local sourcing:
# source a script (windows machines require \\ instead of /)
# sourcedir = '/_njm/'
#source3 = '_genericR_v1.R'
#source(paste(sourcedir, source3, sep=""))

# You need the APE package to source genericR_v1.R scripts
install.packages("ape")
library(ape)

# Remote sourcing:
source("http://ib.berkeley.edu/courses/ib200b/scripts/_genericR_v1.R")
source("http://ib.berkeley.edu/courses/ib200b/scripts/_conus_analyses_v1.R")
source("http://ib.berkeley.edu/courses/ib200b/scripts/_R_tree_functions_v1.R")

# Setup:
# install these, if needed
library(ape)
library(phangorn)  	# helps with "midpoint" function
library(picante)  	# for "Kcalc" and "phylosignal", i.e. calculation of the kappa statistic for continuous parameters
library(geiger) 	# for e.g. fitContinuous


#########################################
# Plot the original DNA tree
#########################################
trfn = "tr.newick"
tr = read.tree(trfn)
plot(tr)
title("Original DNA tree (digitized from Nam et al. 2009\nusing TreeRogue 0.1)\n(Note: axis scale is in digitization coordinates)")
axisPhylo()
add.scale.bar(x=0, y=23, length=1)


# Get the parameters file
params_orig = read_table_good("parameters_njm7.txt")


# identify the types of params
uniform1_params_names = c("s1g", "s2eg", "s2ig", "s3g", "sf2ia", "i1s1g", "i1s2eg", "i1s2ig", "i1s3g", "i1sf2ia", "i2s1g", "i2s2eg", "i2s2ig", "i2s3g", "i2sf2ia")
continuous_params_names = c("s1n", "s1t", "s2en", "s2et", "s2in", "s2it", "s3n", "s3t", "sf2ea", "sf2ed", "sf2id", "tf2eb1", "tf2ec1", "tf2eb2", "tf2ec2", "tf2ib1", "tf2ic1", "tf2ib2", "tf2ic2")
shell_params_names = c("kz", "kr", "turn")
starting_params_names = c("N", "T")
complex_params_names = c("s3n_std", "s3t_spd")
clad_chars_names = c("stripes", "triangles", "dots", "color", "conical", "food")
complex1_names = names(params_orig)[(ncol(params_orig)-25*2) : (ncol(params_orig)-26)]
complex2_names = names(params_orig)[(ncol(params_orig)-25) : ncol(params_orig)]
nonsucky_names = c(continuous_params_names, shell_params_names, starting_params_names, complex_params_names, clad_chars_names)
all_cont_names = c(continuous_params_names, shell_params_names, starting_params_names)

# remove the uniform params
cols_tokeep = names(params_orig) %in% uniform1_params_names == FALSE

all_params = params_orig[, cols_tokeep]
row.names(all_params) = all_params$species
(all_params)


###################################
# Drop the tips we're not using
###################################
all_params_orig = all_params
#n1 = xy3$nodenames[xy3$nodenames != ""]
n1 = tr$tip.label
n2 = all_params$species
n1_to_keep = n1 %in% n2
print(cbind(n1, n1_to_keep))

# drop the tips we're not using
tr2 = drop.tip(tr, n1[n1_to_keep == FALSE])


####################################################################
# reorder the parameters to match the order of tips in the tree
# (this is CRUCIAL!!)
####################################################################
# tree tip order, from bottom to top:
(labels1 = tr2$tip.label)

# order of rows in char_dtf
(labels2 = row.names(all_params))

# reorder char_dtf to match tip.labels
new_order = match(labels1, labels2)
all_params = all_params_orig[new_order, ]

params = all_params

# Cut params down to just continuous parameters for correlation analysis
params = params[, all_cont_names]

####################################

#pairs(params[, -1])
cps = cor(params)
cps = as.data.frame(cps)
names_cps = names(cps)
#dim(params[, -1])

# Rank pairwise correlations
pairwise_cors_list = c()
for (i in 2:nrow(cps))
	{
	if ((i+1) < ncol(cps))
		{
		for (j in (i+1):ncol(cps))
			{
			tmprow = c(names_cps[i], names_cps[j], cps[i,j], abs(cps[i, j]))
			pairwise_cors_list = rbind(pairwise_cors_list, tmprow)
			}
		}
	}
row.names(pairwise_cors_list) = NULL
pairwise_cors_list = as.data.frame(pairwise_cors_list)
names(pairwise_cors_list) = c("x", "y", "cor", "abs_cor")
#rank(pairwise_cors_list$abs_cor)
order_cors = order(pairwise_cors_list$abs_cor, decreasing=TRUE)

pairwise_cors_list = pairwise_cors_list[order_cors, ]

print("Top correlations:")
(pairwise_cors_list[1:20, ])


######################################################
# Phylogenetically Independent Contrasts
######################################################
# we are going to use an ultrametric tree:
chtr2 = read.tree("chtr2.newick")
#chronogram(tr2, scale = 1, expo = 2, minEdgeLength = 1e-06)
write.tree(chtr2, file="chtr2.newick")
plot(chtr2)
title("DNA tree, ultrametricized with APE's chronogram() function: NPRS\n(nonparametric rate smoothing), basically a least squares fit after Sanderson 1997")
axisPhylo()
add.scale.bar(x=0, y=0.9, length=0.1)


pic_params = c()
for (i in 1:ncol(params))
	{
	x = pic(params[,i], chtr2)
	pic_params = cbind(pic_params, x)
	}

pic_params = as.data.frame(pic_params)
names(pic_params) = names(params)

#pairs(pic_params)
cor(pic_params)


# Plot all params vs. all params
tmpparams = params[, all_cont_names]
tmpparams = dfnums_to_numeric(tmpparams)
tmpparams[tmpparams == "?"] = NA
pairs.panels(tmpparams, smooth=FALSE)
title("Regular correlations")
pairs.panels(pic_params, smooth=FALSE)
title("PIC correlations")


# Assemble the tables of correlations for params and PICs of params
res1_txt_table = c()
respic_txt_table = c()
for (i in 1:ncol(params))
	{
	for (j in i:ncol(params))
		{
		if (i == j)
			{
			next()
			}
		cat("x=", names(params)[i], "		vs.		y=", names(params)[j], "\n", sep="")
		
		res1 = cor.test(params[, i], params[, j])
		respic = cor.test(pic_params[, i], pic_params[, j])
		#print()
		#print()
		
		lm_res1 = lm(params[, j] ~ params[, i])
		lm_respic = lm(pic_params[, j] ~ pic_params[, i])
		
		res1_txt = c(names(params)[i], names(params)[j], res1$statistic, res1$parameter, res1$p.value, res1$estimate, res1$null.value, res1$alternative, res1$method, res1$data.name, res1$conf.int[1], res1$conf.int[2], lm_res1$coefficients[2], lm_res1$coefficients[1])
		
		respic_txt = c(names(params)[i], names(params)[j], respic$statistic, respic$parameter, respic$p.value, respic$estimate, respic$null.value, respic$alternative, respic$method, respic$data.name, respic$conf.int[1], respic$conf.int[2], lm_respic$coefficients[2], lm_respic$coefficients[1])
		
		
		res1_txt_table = rbind(res1_txt_table, res1_txt)
		respic_txt_table = rbind(respic_txt_table, respic_txt)
		}
	}

res1_txt_table = as.data.frame(res1_txt_table, row.names=1:nrow(res1_txt_table))
names(res1_txt_table) = (c("x", "y", "statistic_t", "parameter_df", "pval", "cor_estimate", "null", "alternative", "method", "data.name", "conf.int1", "conf.int2", "slope", "intercept"))

respic_txt_table = as.data.frame(respic_txt_table, row.names=1:nrow(respic_txt_table))
names(respic_txt_table) = (c("x", "y", "statistic_t", "parameter_df", "pval", "cor_estimate", "null", "alternative", "method", "data.name", "conf.int1", "conf.int2", "slope", "intercept"))

res1_txt_table = dfnums_to_numeric(res1_txt_table)
respic_txt_table = dfnums_to_numeric(respic_txt_table)

res1_txt_table$pval_bonf = res1_txt_table$pval * nrow(res1_txt_table)
respic_txt_table$pval_bonf = respic_txt_table$pval * nrow(respic_txt_table)

res1_txt_table = res1_txt_table[order(respic_txt_table$pval_bonf),]
respic_txt_table = respic_txt_table[order(respic_txt_table$pval_bonf),]

(res1_txt_table[1:20,])
(respic_txt_table[1:20,])

# sort by ascending p-value
res1_txt_table_sort = res1_txt_table[order(res1_txt_table$pval), ]
respic_txt_table_sort = respic_txt_table[order(respic_txt_table$pval), ]

(res1_txt_table_sort[1:20,])
(respic_txt_table_sort[1:20,])


x = res1_txt_table$cor_estimate
y = respic_txt_table$cor_estimate
xlab = "r (regular)"
ylab="r (PIC)"
xlim = c(-1, 1)
ylim = c(-1, 1)
titlestart_txt = "correlation (r) estimates for pairwise correlations,\nnormal vs. phylogenetically independent contrasts;\n"
basic_xy_plot(x, y, titlestart_txt, xlab, ylab, xlim, ylim)


x = res1_txt_table$pval
y = respic_txt_table$pval
xlab="p-val (regular)"
ylab="p-val (PIC)"
xlim = c(0, 1)
ylim = c(0, 1)
titlestart_txt = "p-values for pairwise correlations,\nnormal vs. phylogenetically independent contrasts\n"
basic_xy_plot(x, y, titlestart_txt, xlab, ylab, xlim, ylim)


# Write the tables to text
write.table(res1_txt_table, "res1_txt_table.txt")
write.table(respic_txt_table, "respic_txt_table.txt")
############################################################


#####################################################
# Assess the parsimony stats of the discrete characters
#####################################################


colnames_to_use = c(complex_params_names, clad_chars_names)
char_dtf_orig = all_params[, colnames_to_use]
char_dtf = char_dtf_orig


# Make the final figure comparing the trees


# quite fast
#(parstats = parsim_stats_fast(tr2, dfnums_to_numeric(char_dtf)))
(parstats = parsim_stats_fast(tr2, dfnums_to_numeric(char_dtf)))
write_table_good(parstats, "parstats_v8.txt")

# slower
#(parstats = parsim_stats(tr2, char_dtf))


# Now, resample (takes a few seconds)
num_resamps = 100
parstats_null = array(data=NA, dim=c(ncol(char_dtf)+1, 6, num_resamps+1))
for (i in 1:num_resamps)
	{
	tmpsamp = resample_dtf_matrix(char_dtf)
	parstats_null[, , i] = as.matrix(parsim_stats_fast(tr2, tmpsamp))
	}
parstats_null[, , num_resamps+1] = as.matrix(parstats)

summary_parstats_means = parstats
summary_parstats_sds = parstats
summary_parstats_pvals = parstats

# do histograms of the six different distances
# set outer margins & number of subplots
par(oma=c(1,1,5,1), mfrow=c(2,3))
for (i in 1:nrow(parstats))
	{
	for (j in 1:ncol(parstats))
		{
		x = parstats_null[i, j, ]
		summary_parstats_means[i, j] = mean(x)
		summary_parstats_sds[i, j] = sd(x)
		
		observed_val = parstats_null[i, j, num_resamps+1]
		pval = round(empirical_pval_fast(x, observed_val), 4)
		
		# redo pval depending on whether value should be low or high
		if (j > 3)
			{
			pval = round(1-pval, 4)
			}
		
		summary_parstats_pvals[i, j] = pval
		
		tmp_title = paste("non-parametric p-value = ", pval, sep="")
		h = hist(x, main=tmp_title, breaks=50, cex.main=1, xlab=names(parstats)[j])
		
		# add the arrow
		arrowtop = 0.3*max(h$counts, na.rm=TRUE)
		arrowbot = 0.03*max(h$counts, na.rm=TRUE)
		arrows(observed_val, arrowtop, observed_val, arrowbot, col="blue", lwd=4)	
		}
	titletxt = paste("Parsimony stats for ", row.names(parstats)[i], sep="")
	mtext(titletxt, outer=TRUE)
	}

write_table_good(summary_parstats_means, "parstats_means_v8.txt")
write_table_good(summary_parstats_sds, "parstats_sds_v8.txt")
write_table_good(summary_parstats_pvals, "parstats_pvals_v8.txt")


#######################################################
# Ancestral character reconstruction --  continuous, plots
#######################################################

par(mfrow=c(1,2))


# character mapping
# uniform1_params_names = c("s1g", "s2eg", "s2ig", "s3g", "sf2ia")
# continuous_params_names = c("s1n", "s1t", "s2en", "s2et", "s2in", "s2it", "s3n", "s3t", "sf2ea", "sf2ed", "sf2id", "tf2eb1", "tf2ec1", "tf2eb2", "tf2ec2", "tf2ib1", "tf2ic1", "tf2ib2", "tf2ic2")
# shell_params_names = c("kz", "kr", "turn")
# starting_params_names = c("N", "T")
# complex_params_names = c("s3n_std", "s3t_spd")
# clad_chars_names


# continuous parameter characters
colnames_to_use = c(continuous_params_names, shell_params_names, starting_params_names)
char_dtf_orig = all_params[, colnames_to_use]
char_dtf = char_dtf_orig


#################
# reorder char_dtf
################
# tree tip order, from bottom to top:
#(labels1 = chtr2$tip.label)
# order of rows in char_dtf
#(labels2 = row.names(char_dtf))
# reorder char_dtf to match tip.labels
#new_order = match(labels1, labels2)
#char_dtf = char_dtf[new_order, ]

par(mfrow=c(6,4))
for (colnum in 1:ncol(char_dtf))
	{
	hist(as.numeric(char_dtf[,colnum]), 17, main=title(names(char_dtf)[colnum]) )
	}
	
par(mfrow=c(1,1))


###########################################################################
# Compare different models for the evolution of this character
###########################################################################
runthis4 = TRUE
if (runthis4 == TRUE)
	{
	# models_list = c("BM", "OU", "lambda", "kappa", "delta", "EB", "white", "trend")
	# cut "trend" model, doesn't matter for ultrametric trees (same as BM model, evidently)
	models_list = c("BM", "OU", "lambda", "kappa", "delta", "EB", "white")
	model_fits = NULL

	# GET THE VARIOUS PARAMETERS ALSO
	
	# beta (typically the rate)
	# alpha (central tendency on BM walk that is proportional to alpha)
	
	# Lambda model
	# multiplies all internal branches of the tree by lambda, leaving tip
	# branches as their original length
	# 
	# kappa raises all branch lengths to the power kappa. As kappa approaches
	# zero, the model becomes speciational.
	#
	# (apparently, kappa is saved as "lamda" in the kappa model)
	
	# delta (delta>1, evo tipwards, < 1, evo at base)
	
	# EB (a = proportion of initial rate represented by the end rate (?))
	
	# White Noise (draw from mean/variance)
	# mean = mean
	# nv = variance (?)

	
	# THESE BOUNDS PRODUCE PATHOLOGICAL RESULTS (on my first dataset)
	bounds_list = NULL
	# bounds_list$BM = list(beta=c(1,100))
	# bounds_list$OU = list(alpha=c(0.0001, 10), beta=c(1, 100))
	# bounds_list$lambda = list(beta=c(1,100))
	# bounds_list$kappa = list(beta=c(1,100))
	# bounds_list$EB = list(beta=c(1,100))
	# bounds_list$white = NULL


	# THESE BOUNDS ARE THE DEFAULTS
	bounds_list = NULL
	
	# db = default bounds
	#db_beta = c(1e-08, 20)
	#db_lambda = c(1e-07, 1)
	#db_kappa = c(1e-06, 1)
	#db_delta = c(1e-05, 2.999999)
	#db_alpha = c(1e-07, 50)
	#db_a = c(-3, 0)
	#db_nv = c(1e-10, 100)
	#db_mu = c(-100, 100)


	# YOU SHOULD ALWAYS CHECK IF YOUR ESTIMATES HIT THE BOUNDS, AND
	# THEN RE-ADJUST
	bounds_list = NULL
	
	# db = default bounds
	db_beta = c(1e-08, 1e08)
	db_alpha = c(1e-07, 50)
	db_lambda = c(1e-07, 1)
	db_kappa = c(1e-06, 1)
	db_delta = c(1e-05, 2.9999)
	db_a = c(-3, 0)
	db_nv = c(1e-10, 1000000)
	#db_mu = c(-100, 100)


	bounds_list$BM = list(beta=db_beta)
	bounds_list$OU = list(beta=db_beta, alpha=db_alpha)
	bounds_list$lambda = list(beta=db_beta, lambda=db_lambda)
	bounds_list$kappa = list(beta=db_beta, lambda=db_kappa)
	bounds_list$EB = list(beta=db_beta, a=db_a)
	bounds_list$white = list(nv=db_nv)
	# bounds_list$model = list(mu=db_mu)


	for (i in 1:length(models_list))
		{
		# Chose the right bounds
		cmdstr = paste("tmp_bounds = bounds_list$", models_list[i], sep="")
		eval(parse(text = cmdstr))

		model_fit = fitContinuous(chtr2, char_dtf, model=models_list[i], bounds = tmp_bounds)
		cmdstr = paste("model_fits$", models_list[i], " = model_fit", sep="")
		eval(parse(text = cmdstr))
		}
	# Save R object to a file
	save(model_fits, file="model_fits.Rdata")
	} else {
	load(file="model_fits.Rdata")
	}


model_fits_orig = model_fits
model_fits = model_fits_orig

# Get log likelihoods and AICs
results_list = c("lnl", "aic", "aicc", "k", "beta", "alpha", "lambda", "delta", "a", "mean", "nv")

# fill in blank results
for (item in results_list)
	{
	cmdstr = paste(item, "_matrix = matrix(NA, nrow=length(models_list), ncol=length(all_cont_names))", sep="")
	eval(parse(text = cmdstr))	
	}
	
# put in the data from the model results
for (i in 1:length(models_list))
	{
	for (j in 1:length(all_cont_names))
		{
		for (item in results_list)
			{
			cmdstr = paste("tmp_item = model_fits$", models_list[i], "$", all_cont_names[j], "$", item, sep="")
			eval(parse(text = cmdstr))
			if (is.null(tmp_item))
				{
				cmdstr = paste(item, "_matrix[i,j] = NA", sep="")
				eval(parse(text = cmdstr))	
				} else {
				cmdstr = paste(item, "_matrix[i,j] = tmp_item", sep="")
				eval(parse(text = cmdstr))
				}
			}
		}
	}


# Apply the row & column names
for (item in results_list)
	{
	cmdstr = paste(item, "_matrix = adf(", item, "_matrix)", sep="")
	eval(parse(text = cmdstr))
	
	cmdstr = paste("names(", item, "_matrix) = names(char_dtf)", sep="")
	eval(parse(text = cmdstr))
	
	cmdstr = paste("row.names(", item, "_matrix) = models_list", sep="")
	eval(parse(text = cmdstr))
	
	# write to txt file
	cmdstr = paste("write_table_good(", item, "_matrix, '", item, "_matrix.txt')", sep="")
	eval(parse(text = cmdstr))
	}


lnl_matrix
aic_matrix
aicc_matrix
k_matrix
beta_matrix
alpha_matrix
lambda_matrix
delta_matrix
a_matrix
mean_matrix
nv_matrix


# Compare the models using AIC
# Another way: use AIC
# For each col, subtract the min

diff_aic_matrix = aic_matrix

for (colnum in 1:ncol(aic_matrix))
	{
	diff_aic_matrix[, colnum] = aic_matrix[, colnum] - min(aic_matrix[, colnum], na.rm=TRUE)
	}

cat("Table of difference-from-best-AIC values; 0 = best model\n")
print(diff_aic_matrix, digits=2)


# Citation for this calculation:
# http://www.brianomeara.info/tutorials/aic
# 
# or:
#
# 
aic_wi = exp(1) ^ (-0.5 * diff_aic_matrix)
colsum_aic_wi = apply(aic_wi, 2, sum, na.rm=TRUE)

# calculate the relative weights
aic_weights = aic_wi
for (colnum in 1:ncol(aic_wi))
	{
	aic_weights[,colnum] = aic_wi[,colnum] / colsum_aic_wi[colnum]
	}

# 
cat("These are the AIC weights, calculated according to Burnham & Anderson (2002).\n")
print(aic_weights, digits=2)

# flip aic_weights for graphing
aic_weights[nrow(aic_weights):1, ] = aic_weights
row.names(aic_weights) = row.names(aic_weights)[nrow(aic_weights):1]


#####################################
# Plot AIC weights, with labels
#####################################
# What are the best models under AIC
best_model_list = rownames(aic_weights)

best_models = rep(NA, length(colnames(aic_weights)))
for (colnum in 1:length(colnames(aic_weights)))
	{
	TF_row = aic_weights[,colnum] == max(aic_weights[,colnum])
	best_models[colnum] = best_model_list[TF_row]
	}
print(best_models)


# Get results
#pdf(file = "conus_cors_all_vs_all_v3.pdf", width=15, height=15)
doPDFs2 = TRUE
if (doPDFs2 == TRUE)
	{
	pdffn = "Figure_SX_model_compare_v8.pdf"
	pdf(file=pdffn, width=8, height=10, paper="USr")
	}

par(oma=c(1,1,1,1), mar=c(15.6, 4.1, 4.1, 7.1), mfrow = c(1,1))
par(xpd=TRUE)
tmp_colors = c("white", "cyan", "yellow", "green2", "orange", "red", "brown")
tmp_xnames = rep("", length(colnames(aic_weights)))
legend_text = rev(c("Brownian", "O-U", "lambda", "kappa", "delta", "early burst", "white noise"))
tmp_legend = list(x=38, y=0.9, pt.cex=4)

# don't plot y-axis here
par(yaxt="n")
bar_x_positions = barplot(aic_weights, names.arg=tmp_xnames, col=tmp_colors, ylab="Proportion of AIC weight", xlab="shell growth parameter", legend.text=legend_text, args.legend=tmp_legend)

# plot y-axis here
par(yaxt="s")
ticks = c(0, 0.25, 0.5, 0.75, 1)
axis(side=2, at=ticks, line=0, labels=ticks, lty = "solid", lwd = 1)

## Create plot and get bar midpoints in 'mp'
#bar_x_positions = barplot(1:length(colnames(aic_weights)))

## Set up x axis with tick marks alone
#axis(1, at = bar_x_positions, labels = FALSE)

# Create some text labels
tmp_x_labels = colnames(aic_weights)

# Plot x axis labels at mp
text(x=bar_x_positions-0.1, y=-0.015, adj=0, srt=315, labels = tmp_x_labels, xpd=TRUE)

# Plot the best models across the top
text(x=bar_x_positions-0.1, y=1.015, adj=0, srt=45, labels = best_models, xpd=TRUE)

toptxt = "(Best model under AIC)"
text(x=15, y=1.14, labels=toptxt)

caption = "Figure SX: Akaike weights of 7 models for the evolution of shell parameters.  The models \ncompared are: Brownian motion (BM, brown); the Ornstein-Uhlenbeck (OU, red) \nstabilizing selection model; the lambda model (orange) which rescales internal branch \nlengths by a linear fraction; the kappa model (green) which rescales each branch length \nby a power equal to the kappa parameter, and which becomes a speciational model as \nkappa approaches 0; delta (yellow) which focuses change towards the base or tips; \nearly burst (EB, cyan) which has an initial high rate of change that then declines; and white \nnoise (white), where observations are produced by a normal distribution with no tree  \nstructure, which represents the situation of no phylogenetic signal. Brownian motion (BM) \nhas the highest AIC weight for 63% of the shell parameters (15/24).  White noise \n(no phylogenetic signal) is superior for 25% (6/24) shell parameters."

text(x=-5.4, y=-0.56, labels=caption, adj=0, xpd=TRUE)

##########################################################

# Turn off the PDF and open
if (doPDFs2 == TRUE)
	{
	dev.off()
	cmdstr = paste("open ", pdffn, sep="")
	system(cmdstr)
	}


##########################################################
# Do LRT on the different models
##########################################################
# Compare all models to BM
pchisq_table = matrix(NA, nrow=5, ncol=ncol(lnl_matrix))

for (colnum in 1:ncol(lnl_matrix))
	{
	tmp_lnls = lnl_matrix[, colnum]
	
	# Calculate the log-likelihood differences from 
	# the Brownian motion model
	#
	# You can't compare white noise, since it's not nested
	tmp_lnls_diffs = tmp_lnls[2:(length(tmp_lnls)-1)] - tmp_lnls[1]
	
	# take twice the difference
	tmp_lnls_twice_diffs = 2 * tmp_lnls_diffs
	dim(tmp_lnls_twice_diffs) = c(1, length(tmp_lnls_twice_diffs))
	
	# Do chi-squared test with 1 degree of freedom
	tmp_pchisq = apply(tmp_lnls_twice_diffs, 1, pchisq, 1, lower.tail=FALSE)
	
	pchisq_table[,colnum] = tmp_pchisq
	
	#print(tmp_pchisq)
	}


pchisq_table = adf(pchisq_table)
names(pchisq_table) = names(lnl_matrix)
row.names(pchisq_table) = row.names(lnl_matrix)[2:(nrow(lnl_matrix)-1)]

print(pchisq_table, digits=2)

write_table_good(pchisq_table, "pchisq_table.txt")


# 11 / 120 times there is a model significantly better than BM
pchisq_table < 0.05
sum(pchisq_table < 0.05)

# 5/ 24 params reject BM significantly:
# sf2id, tf2ib1, tf2ib2, kr, turn
BM_rejected = apply(pchisq_table < 0.05, 2, sum)


# All false with Bonferroni correction, but that's probably not fair
pchisq_table < (0.05 / (nrow(pchisq_table) * ncol(pchisq_table)) )

##########################################################


#x = dtf_to_phyDat(char_dtf)
txt_suffix = ""


# ACE methods:
# a character specifying the method used for estimation. Three choices are possible: "ML", "pic", or "GLS".

# ACE with ML method
# BM = brownian motion
num_internal_nodes = chtr2$Nnode


PICcont_means = array(NA, dim=c(num_internal_nodes, ncol(char_dtf)) )
PICcont_lower025 = array(NA, dim=c(num_internal_nodes, ncol(char_dtf)) )
PICcont_upper975 = array(NA, dim=c(num_internal_nodes, ncol(char_dtf)) )

for (i in 1:ncol(char_dtf))
	{
	PICcont = ace(char_dtf[,i], chtr2, type="continuous", method="pic", CI=TRUE, model="BM")
	PICcont_means[, i] = PICcont$ace
	PICcont_lower025[, i] = PICcont$CI95[, 1]
	PICcont_upper975[, i] = PICcont$CI95[, 2]

	}
PICcont_means = adf(PICcont_means)
names(PICcont_means) = names(char_dtf)
tree_dtf = prt(chtr2)
row.names(PICcont_means) = tree_dtf$node[ (length(chtr2$tip.label)+1): length(tree_dtf$node)]
PICcont_lower025 = adf(PICcont_lower025)
names(PICcont_lower025) = names(char_dtf)
row.names(PICcont_lower025) = row.names(PICcont_means)
PICcont_upper975 = adf(PICcont_upper975)
names(PICcont_upper975) = names(char_dtf)
row.names(PICcont_upper975) = row.names(PICcont_means)

write_table_good_w_rownames(PICcont_means, "PICcont_means_v8.txt")
write_table_good_w_rownames(PICcont_lower025, "PICcont_lower025_v8.txt")
write_table_good_w_rownames(PICcont_upper975, "PICcont_upper975_v8.txt")


# plot tip & node values to check them


# plot tip & node values to check them
txt1 = "PIC"
txt2 = txt_suffix
for (i in 1:ncol(char_dtf))
	{
	recon_txt = paste("ACE technique: ", txt1, txt2, sep="")
	
	tmptipvals = char_dtf[,i]
	tmpnodevals = PICcont_means[,i]
	maxval = max(c(tmptipvals, tmpnodevals))
	texttxt = paste(recon_txt, "\nParameter: ", names(char_dtf)[i], "\nMaximum value = ", maxval, sep="") 

	plot_cont_vals_on_tree(chtr2, tmptipvals, tmpnodevals, texttxt)
	}


MLcont_means = array(NA, dim=c(num_internal_nodes, ncol(char_dtf)) )
MLcont_lower025 = array(NA, dim=c(num_internal_nodes, ncol(char_dtf)) )
MLcont_upper975 = array(NA, dim=c(num_internal_nodes, ncol(char_dtf)) )
Blombergs_Kstats = c()
PIC.variance.obs = c()
PIC.variance.rnd.mean = c()
PIC.variance.P = c()
PIC.variance.Z = c()


for (i in 1:ncol(char_dtf))
	{
	MLcont = ace(char_dtf[,i], chtr2, type="continuous", method="ML", CI=TRUE, model="BM")
	phylosignals = phylosignal(char_dtf[,i], chtr2)
	Blombergs_Kstats = c(Blombergs_Kstats, phylosignals$K)
	PIC.variance.obs = c(PIC.variance.obs, phylosignals$PIC.variance.obs)
	PIC.variance.rnd.mean = c(PIC.variance.rnd.mean, phylosignals$PIC.variance.rnd.mean)
	PIC.variance.P = c(PIC.variance.P, phylosignals$PIC.variance.P)
	PIC.variance.Z = c(PIC.variance.Z, phylosignals$PIC.variance.Z)

	MLcont_means[, i] = MLcont$ace
	MLcont_lower025[, i] = MLcont$CI95[, 1]
	MLcont_upper975[, i] = MLcont$CI95[, 2]
	}
MLcont_means = adf(MLcont_means)
names(MLcont_means) = names(char_dtf)
tree_dtf = prt(chtr2)
row.names(MLcont_means) = tree_dtf$node[ (length(chtr2$tip.label)+1): length(tree_dtf$node)]
MLcont_lower025 = adf(MLcont_lower025)
names(MLcont_lower025) = names(char_dtf)
row.names(MLcont_lower025) = row.names(MLcont_means)
MLcont_upper975 = adf(MLcont_upper975)
names(MLcont_upper975) = names(char_dtf)
row.names(MLcont_upper975) = row.names(MLcont_means)

# Bonferroni correction
PIC.variance.P_bonferroni = PIC.variance.P * length(Blombergs_Kstats)
phylosignal_table = cbind(Blombergs_Kstats, PIC.variance.obs, PIC.variance.rnd.mean, PIC.variance.Z, PIC.variance.P, PIC.variance.P_bonferroni)
row.names(phylosignal_table) = names(char_dtf)
(phylosignal_table)

write_table_good_w_rownames(MLcont_means, "MLcont_means_v8.txt")
write_table_good_w_rownames(MLcont_lower025, "MLcont_lower025_v8.txt")
write_table_good_w_rownames(MLcont_upper975, "MLcont_upper975_v8.txt")
write_table_good_w_rownames(phylosignal_table, "phylosignal_table_v8.txt")


# plot tip & node values to check them
txt1 = "ML"
txt2 = txt_suffix
for (i in 1:ncol(char_dtf))
	{
	recon_txt = paste("ACE technique: ", txt1, txt2, sep="")
	
	tmptipvals = char_dtf[,i]
	tmpnodevals = PICcont_means[,i]
	maxval = max(c(tmptipvals, tmpnodevals))
	texttxt = paste(recon_txt, "\nParameter: ", names(char_dtf)[i], "\nMaximum value = ", maxval, sep="") 

	plot_cont_vals_on_tree(chtr2, tmptipvals, tmpnodevals, texttxt)
	}


GLScont_means = array(NA, dim=c(num_internal_nodes, ncol(char_dtf)) )
GLScont_lower025 = array(NA, dim=c(num_internal_nodes, ncol(char_dtf)) )
GLScont_upper975 = array(NA, dim=c(num_internal_nodes, ncol(char_dtf)) )
for (i in 1:ncol(char_dtf))
	{
	GLScont = ace(char_dtf[,i], chtr2, type="continuous", method="GLS", CI=TRUE, corStruct = corBrownian(1, chtr2))
	GLScont_means[, i] = GLScont$ace
	GLScont_lower025[, i] = GLScont$CI95[, 1]
	GLScont_upper975[, i] = GLScont$CI95[, 2]
	}
GLScont_means = adf(GLScont_means)
names(GLScont_means) = names(char_dtf)
tree_dtf = prt(chtr2)
row.names(GLScont_means) = tree_dtf$node[ (length(chtr2$tip.label)+1): length(tree_dtf$node)]
GLScont_lower025 = adf(GLScont_lower025)
names(GLScont_lower025) = names(char_dtf)
row.names(GLScont_lower025) = row.names(GLScont_means)
GLScont_upper975 = adf(GLScont_upper975)
names(GLScont_upper975) = names(char_dtf)
row.names(GLScont_upper975) = row.names(GLScont_means)

write_table_good_w_rownames(GLScont_means, "GLScont_means_v8.txt")
write_table_good_w_rownames(GLScont_lower025, "GLScont_lower025_v8.txt")
write_table_good_w_rownames(GLScont_upper975, "GLScont_upper975_v8.txt")

# write the tree out to a table for Excel
tmptree_table = prt(chtr2)
tmptree_table$daughter_nds = as.character(tmptree_table$daughter_nds)
#tmptree_table = tmptree_table[, !(names(tmptree_table) == "daughter_nds")]
write_table_good(tmptree_table, "chtr2_v8.txt")

# plot tip & node values to check them
txt1 = "GLS"
txt2 = txt_suffix
for (i in 1:ncol(char_dtf))
	{
	recon_txt = paste("ACE technique: ", txt1, txt2, sep="")
	
	tmptipvals = char_dtf[,i]
	tmpnodevals = PICcont_means[,i]
	maxval = max(c(tmptipvals, tmpnodevals))
	texttxt = paste(recon_txt, "\nParameter: ", names(char_dtf)[i], "\nMaximum value = ", maxval, sep="") 

	plot_cont_vals_on_tree(chtr2, tmptipvals, tmpnodevals, texttxt)
	}


# Compare the estimates against each other with pairs plots
for (i in 1:ncol(char_dtf))
	{
	tmpcols = cbind(PICcont_means[,i], MLcont_means[,i], GLScont_means[,i])
	tmpcols = adf(tmpcols)
	names(tmpcols) = c("ACE_pic", "ACE_ML", "ACE_GLS")
	
	pairs.panels(tmpcols)
	
	titletxt = paste("param #", i, ": ", names(char_dtf)[i], sep="")
	title(titletxt, line=-1.35, outer=TRUE)
	}


#######################################################
# Reconstruct the ancestors -- discrete, plots
# discrete parameter characters
#######################################################
colnames_to_use = c(complex_params_names, clad_chars_names)
char_dtf = all_params[, colnames_to_use]
num_uniq = apply(char_dtf, 2, function(x) length(unique(x)))


# parsimony reconstruction
#
# https://stat.ethz.ch/pipermail/r-sig-phylo/2010-January/000565.html
# Hi Santiago,
# 
# Look at the ace function in the Ape package - to get a max parsimony  
# reconstruction set branch lengths =1 and run a maximum likelihood  
# estimation.  You can find further information here: http://www.r-phylo.org/wiki/HowTo/Ancestral_State_Reconstruction
# 

#x = dtf_to_phyDat(char_dtf)

# make a parsimony tree: 
pars_tr = chtr2
# set all branch lengths to 1
pars_tr$edge.length[!is.na(pars_tr$edge.length)] = 1
#ace(char_dtf[,1], pars_tr, type="discrete", method="ML", model="ER")


par(mfrow=c(1,2))
ACE_pars_estimates = array(NA, dim=c(num_internal_nodes, ncol(char_dtf)) )
for (i in 1:ncol(char_dtf))
	{
	(ACE_pars = ace(char_dtf[,i], pars_tr, type="discrete", method="ML", model="ER", ip=0.3))
	
	# convert to parsimony
	maxpar_node_vals = ACE_pars$lik.anc
	maxpar_node_maxes = apply(maxpar_node_vals, 1, max)

	maxpar_node_vals = adf(maxpar_node_vals)
	maxpar_node_MP = array(NA, dim=c(num_internal_nodes, 1))
	for (j in 1:nrow(maxpar_node_vals))
		{
		tmpTFs = maxpar_node_vals[j,] == maxpar_node_maxes[j]
		names(tmpTFs)
		colnum = which(tmpTFs)
		if (length(colnum) > 1)
			{
			tmplist = c()
			for (k in 1:length(colnum))
				{
				tmplist[k] = names(maxpar_node_vals)[colnum[k]]
				}
			tmpstr = paste(tmplist, collapse="&")
			maxpar_node_MP[j] = tmpstr
			} else {
			maxpar_node_MP[j] = names(maxpar_node_vals)[colnum]
			}
		}
		
	ACE_pars_estimates[, i] = maxpar_node_MP
	
	# plot results
	tree_ht = get_max_height_tree(chtr2)
	label_offset = 0.05*tree_ht
	plot(chtr2, show.node.label=FALSE, cex=0.9, no.margin = TRUE, x.lim=1.6*tree_ht, label.offset=label_offset)

	if (num_uniq[i] == 2)
		{
		colors = c("yellow", "blue")
		tmpcols = char_dtf[,i]
		tmpcols[tmpcols == 0] = colors[1]
		tmpcols[tmpcols == 1] = colors[2]
		}
	if (num_uniq[i] == 3)
		{
		colors = c("yellow", "blue", "red")
		tmpcols = char_dtf[,i]
		tmpcols[tmpcols == 0] = colors[1]
		tmpcols[tmpcols == 1] = colors[2]
		tmpcols[tmpcols == 2] = colors[3]
		}

	tiplabels(pch = 19, col = tmpcols, cex = 2)  #adj = 0, 
	legend(0, 3, legend=as.character(0:max(char_dtf[,i])), fill=colors)

	nodelabels(thermo=ACE_pars$lik.anc, cex=1, piecol=colors)
	
	titletxt = paste("ML, parsimony-like reconstruction of\n", names(char_dtf)[i], " character", sep="")
	title(titletxt, cex=0.5)
	
	
	}

ACE_pars_estimates = adf(ACE_pars_estimates)
names(ACE_pars_estimates) = names(char_dtf)
pars_tr_dtf = prt(pars_tr)
row.names(ACE_pars_estimates) = tree_dtf$node[ (length(pars_tr$tip.label)+1): length(pars_tr_dtf$node)]
#row.names(ACE_pars_estimates) = row.names(char_dtf)
(ACE_pars_estimates)

write_table_good(ACE_pars_estimates, "ACE_pars_estimates_v8.txt")


# Do ML reconstruction, with varying branch lengths (i.e. not branches = 1 like "parsimony")
par(mfrow=c(1,2))
ACE_ML_estimates = array(NA, dim=c(num_internal_nodes, ncol(char_dtf)) )
for (i in 1:ncol(char_dtf))
	{
	(ACE_ML = ace(char_dtf[,i], chtr2, type="discrete", method="ML", model="ER", ip=0.3))


	# convert to the single best ML estimate (plurality vote)
	maxML_node_vals = ACE_ML$lik.anc
	maxML_node_maxes = apply(maxML_node_vals, 1, max)

	maxML_node_vals = adf(maxML_node_vals)
	maxML_node_best = array(NA, dim=c(num_internal_nodes, 1))
	for (j in 1:nrow(maxML_node_vals))
		{
		tmpTFs = round(maxML_node_vals[j,], 3) == round(maxML_node_maxes[j], 3)
		names(tmpTFs)
		colnum = which(tmpTFs)
		if (length(colnum) > 1)
			{
			tmplist = c()
			for (k in 1:length(colnum))
				{
				tmplist[k] = names(maxML_node_vals)[colnum[k]]
				}
			tmpstr = paste(tmplist, collapse="&")
			maxML_node_best[j] = tmpstr
			} else {
			maxML_node_best[j] = names(maxML_node_vals)[colnum]
			}
		}
	
	#cbind(maxML_node_vals, maxML_node_maxes, maxML_node_best)
	
	ACE_ML_estimates[, i] = maxML_node_best

	# plot results	
	tree_ht = get_max_height_tree(chtr2)
	label_offset = 0.05*tree_ht
	plot(chtr2, show.node.label=FALSE, cex=0.9, no.margin = TRUE, x.lim=1.6*tree_ht, label.offset=label_offset)
	
	if (num_uniq[i] == 2)
		{
		colors = c("yellow", "blue")
		tmpcols = char_dtf[,i]
		tmpcols[tmpcols == 0] = colors[1]
		tmpcols[tmpcols == 1] = colors[2]
		}
	if (num_uniq[i] == 3)
		{
		colors = c("yellow", "blue", "red")
		tmpcols = char_dtf[,i]
		tmpcols[tmpcols == 0] = colors[1]
		tmpcols[tmpcols == 1] = colors[2]
		tmpcols[tmpcols == 2] = colors[3]
		}
	tiplabels(pch = 19, col = tmpcols, cex = 2)  #adj = 0, 
	legend(0, 3, legend=as.character(0:max(char_dtf[,i])), fill=colors)

	nodelabels(thermo=ACE_ML$lik.anc, cex=1, piecol=colors)
	
	titletxt = paste("ML, equal rates reconstruction of\n", names(char_dtf)[i], " character", sep="")
	title(titletxt, cex=0.5)
	}


ACE_ML_estimates = adf(ACE_ML_estimates)
names(ACE_ML_estimates) = names(char_dtf)
pars_tr_dtf = prt(pars_tr)
row.names(ACE_ML_estimates) = tree_dtf$node[ (length(pars_tr$tip.label)+1): length(pars_tr_dtf$node)]
#row.names(ACE_ML_estimates) = row.names(char_dtf)
(ACE_ML_estimates)

write_table_good(ACE_ML_estimates, "ACE_ML_estimates_v8.txt")


# Write out the trees we generates, also...
write.tree(tr, "tr.newick")
write.tree(tr2, "tr2.newick")
write.tree(chtr2, "chtr2.newick")