# READ THIS: You have done enough R that I don't have to give you
# every single command this time.  E.g., you now know how to 
# source scripts, install packages, set working directories, etc.
# If I don't give you every single command to get set up,
# figure it out!


# Load necessary libraries; install phangorn if you don't have it
# Also, source the generic and R_tree functions scripts
library(ape)
library(phangorn)

sourcedir = '/_njm/'
source3 = '_genericR_v1.R'
source(paste(sourcedir, source3, sep=""))

source3 = '_R_tree_functions_v1.R'
source(paste(sourcedir, source3, sep=""))

# This is a new script for running r8s
source3 = 'r8s_functions_v1.R'
source(paste(sourcedir, source3, sep=""))

# All the scripts are kept online, here:
# http://ib.berkeley.edu/courses/ib200b/scripts/

# e.g., source the new r8s script:
source("http://ib.berkeley.edu/courses/ib200b/scripts/r8s_functions_v1.R")

# set working directory
# NOTE: MAKE SURE THE WD INCLUDES A LAST "/" CHARACTER!!
wd = "/Users/nick/Desktop/_ib200a/ib200b_sp2011/lab07/"
setwd(wd)


#########################################################
# In this lab, we are going to use some data from Nick's projects
#
# The major point will be (a) BE CAREFUL and (b) THINK when you are dating. (Ha!)
#########################################################

# First, we will look at some phylogenetic results on the complete
# mtDNA sequence of 60-some horse breeds, and an outgroup, the wild ass.

# get Nick's tree file
# I made this tree with MrBayes
trfn = "cons_Ecab_chrM_12May2010.fasta_nn.aln_prc.nexus_reroot.nexus"

horsetr = read.nexus(trfn)
plot(horsetr)

# Question: Why are the horse sequences so clustered?
# Question: Look at the tree.  Does it "look" clocklike?


# Let's make an ultrametric tree to compare.  chronopl is the 
# R function we've been using

# Note: we are scaling all trees to be of height 1 for now
chronotr = chronopl(horsetr, lambda = 0, age.min = 1)
plot(chronotr)
axisPhylo()
add.scale.bar()

# Question: What happened to the tree?  Is this good?

# Go to the website and look at compare_Rchrono_r8s.pdf
# Question: what looks to be the problem with chronopl
# (I mean the problem you can see in the trees, not the 
#  computational cause of it, which I don't know.)


# Basically, chronopl doesn't work right at the moment.  
# A better implementation is in Michael Sanderson's program r8s.

# You need to download r8s and place the executable file (it is a command-line program)
# in your working directory.  Then you have to tell run_r8s_1calib where the
# r8s program is by specifying the r8s_path_tmp variable

# (this is where I have mine)
r8s_path_tmp = "/Applications/r8s"
calibration_node_tip_specifiers = list("ass", "przewalski")
nsites_tmp = 16331
tmpwd1 = wd
r8s_nexus_fn1 = "horsetr_nexus_fn.nex"
r8s_ultra_logfn = run_r8s_1calib(tr=horsetr, calibration_node_tip_specifiers, r8s_method="LF", addl_cmd="", calibration_age=1.0, nsites=nsites_tmp, tmpwd=tmpwd1, r8s_nexus_fn=r8s_nexus_fn1, r8s_path=r8s_path_tmp)

r8s_tr_ultraLF = extract_tree_from_r8slog(logfn=r8s_ultra_logfn, delimiter=" = ")

# Does this look like a reasonable result?
plot(r8s_tr_ultraLF)
axisPhylo()
add.scale.bar()

# This function checks if a tree is it ultrametric?  
is.ultrametric(r8s_tr_ultraLF)


# run_r8s_1calib creates a NEXUS commands file for r8s.  This can be
# run in r8s via the command line without R at all (which is how
# most people who aren't me would do it).

# Take a look at it, just so you have seen a NEXUS commands file.
# PAUP, MrBayes, r8s, and many other standard programs use such files

moref("horsetr_nexus_fn.nex")


# We can also run r8s's NPRS (non-parametric rate scaling):

r8s_path_tmp = "/Applications/r8s"
calibration_node_tip_specifiers = list("ass", "przewalski")
nsites_tmp = 16331
tmpwd1 = wd
r8s_nexus_fn1 = "horsetr_nexus_fn.nex"
r8s_ultra_logfn = run_r8s_1calib(tr=horsetr, calibration_node_tip_specifiers, r8s_method="NPRS", addl_cmd="", calibration_age=1.0, nsites=nsites_tmp, tmpwd=tmpwd1, r8s_nexus_fn=r8s_nexus_fn1, r8s_path=r8s_path_tmp)

r8s_tr_ultraNPRS = extract_tree_from_r8slog(logfn=r8s_ultra_logfn, delimiter=" = ")

# Does this look like a reasonable result?
plot(r8s_tr_ultraNPRS)
axisPhylo()
add.scale.bar()


# Look at the tree.  Believable?


#########################################################
# Testing the molecular clock in R: note -- doesn't work
#########################################################

# This is roughly how you would check for a molecular clock
# in R...however, there seem to be bugs in optim.pml that produce
# VERY weird branch lengths

# get the DNA data
datafn = "cons_Ecab_chrM_12May2010.fasta_nn.aln_prc.nexus.phy"
mtDNA = read.phyDat(file=datafn, format="phylip", type="DNA")

# set up a simple model.
fitmodel_default = pml(tree=unroot(horsetr), data=mtDNA)

# optimize the model on default options (this does not estimate base frequencies etc.)
#fitmodel_simple = optim.pml(fitmodel_default)

# look at the output, and at fitmodel_simple.  E.g., try each of these
(fitmodel_simple)
class(fitmodel_simple)
attributes(fitmodel_simple)
summ(fitmodel_simple)

# What has the function returned?


# optimize all of the sequence evolution model parameters
# and let the branch lengths be freely-estimated parameters
# here, we are just letting base frequencies and branch lengths vary,
# otherwise we are using a Jukes-Cantor mode
fitmodel_brlens = optim.pml(fitmodel_default, optBf=FALSE, optQ=FALSE, optInv=FALSE, optGamma=FALSE, optEdge=TRUE, optRate=FALSE, maxit=100)


# now fit the global rate of sequence evolution
fitmodel_rate = optim.pml(fitmodel_brlens, optBf=FALSE, optQ=FALSE, optInv=FALSE, optGamma=FALSE, optEdge=FALSE, optRate=TRUE)

# extract the tree
fit_tree_no_ultra_tmp = fitmodel_rate$tree
fit_tree_no_ultra = midpoint(fit_tree_no_ultra_tmp)
plot(fit_tree_no_ultra)


# Now get the likelihood for the ultrametric tree

#r8s_tr_ultraLF
# set up a simple model.
fitmodel_default2 = pml(tree=unroot(r8s_tr_ultraLF), data=mtDNA)

# don't optimize the branch lengths this time (we are fixing them to be ultrametric)
fitmodel_brlens2 = optim.pml(fitmodel_default2, optBf=TRUE, optQ=TRUE, optInv=FALSE, optGamma=TRUE, optEdge=FALSE, optRate=FALSE)

plot(midpoint(fitmodel_brlens2$tree))

# now fit the global rate of sequence evolution
fitmodel_rate2 = optim.pml(fitmodel_brlens2, optBf=FALSE, optQ=FALSE, optInv=FALSE, optGamma=FALSE, optEdge=FALSE, optRate=TRUE)
plot(midpoint(fitmodel_rate2$tree))

# extract the tree
fit_tree_ultra_tmp = fitmodel_rate2$tree
fit_tree_ultra = midpoint(fit_tree_ultra_tmp)
plot(fit_tree_ultra)


# Now, compare the likelihoods with a Likelihood Ratio Test
# ...however, the optim.pml doesn't
# seem to work very well, so let's not do this.
#########################################################


# Question: What is the model?  What is the likelihood of the data 
# under this model and tree?