# READ THIS: You have done enough R that I don't have to give you # every single command this time. E.g., you now know how to # source scripts, install packages, set working directories, etc. # If I don't give you every single command to get set up, # figure it out! # Load necessary libraries; install phangorn if you don't have it # Also, source the generic and R_tree functions scripts library(ape) library(phangorn) sourcedir = '/_njm/' source3 = '_genericR_v1.R' source(paste(sourcedir, source3, sep="")) source3 = '_R_tree_functions_v1.R' source(paste(sourcedir, source3, sep="")) # This is a new script for running r8s source3 = 'r8s_functions_v1.R' source(paste(sourcedir, source3, sep="")) # All the scripts are kept online, here: # http://ib.berkeley.edu/courses/ib200b/scripts/ # e.g., source the new r8s script: source("http://ib.berkeley.edu/courses/ib200b/scripts/r8s_functions_v1.R") # set working directory # NOTE: MAKE SURE THE WD INCLUDES A LAST "/" CHARACTER!! wd = "/Users/nick/Desktop/_ib200a/ib200b_sp2011/lab07/" setwd(wd) ######################################################### # In this lab, we are going to use some data from Nick's projects # # The major point will be (a) BE CAREFUL and (b) THINK when you are dating. (Ha!) ######################################################### # First, we will look at some phylogenetic results on the complete # mtDNA sequence of 60-some horse breeds, and an outgroup, the wild ass. # get Nick's tree file # I made this tree with MrBayes trfn = "cons_Ecab_chrM_12May2010.fasta_nn.aln_prc.nexus_reroot.nexus" horsetr = read.nexus(trfn) plot(horsetr) # Question: Why are the horse sequences so clustered? # Question: Look at the tree. Does it "look" clocklike? # Let's make an ultrametric tree to compare. chronopl is the # R function we've been using # Note: we are scaling all trees to be of height 1 for now chronotr = chronopl(horsetr, lambda = 0, age.min = 1) plot(chronotr) axisPhylo() add.scale.bar() # Question: What happened to the tree? Is this good? # Go to the website and look at compare_Rchrono_r8s.pdf # Question: what looks to be the problem with chronopl # (I mean the problem you can see in the trees, not the # computational cause of it, which I don't know.) # Basically, chronopl doesn't work right at the moment. # A better implementation is in Michael Sanderson's program r8s. # You need to download r8s and place the executable file (it is a command-line program) # in your working directory. Then you have to tell run_r8s_1calib where the # r8s program is by specifying the r8s_path_tmp variable # (this is where I have mine) r8s_path_tmp = "/Applications/r8s" calibration_node_tip_specifiers = list("ass", "przewalski") nsites_tmp = 16331 tmpwd1 = wd r8s_nexus_fn1 = "horsetr_nexus_fn.nex" r8s_ultra_logfn = run_r8s_1calib(tr=horsetr, calibration_node_tip_specifiers, r8s_method="LF", addl_cmd="", calibration_age=1.0, nsites=nsites_tmp, tmpwd=tmpwd1, r8s_nexus_fn=r8s_nexus_fn1, r8s_path=r8s_path_tmp) r8s_tr_ultraLF = extract_tree_from_r8slog(logfn=r8s_ultra_logfn, delimiter=" = ") # Does this look like a reasonable result? plot(r8s_tr_ultraLF) axisPhylo() add.scale.bar() # This function checks if a tree is it ultrametric? is.ultrametric(r8s_tr_ultraLF) # run_r8s_1calib creates a NEXUS commands file for r8s. This can be # run in r8s via the command line without R at all (which is how # most people who aren't me would do it). # Take a look at it, just so you have seen a NEXUS commands file. # PAUP, MrBayes, r8s, and many other standard programs use such files moref("horsetr_nexus_fn.nex") # We can also run r8s's NPRS (non-parametric rate scaling): r8s_path_tmp = "/Applications/r8s" calibration_node_tip_specifiers = list("ass", "przewalski") nsites_tmp = 16331 tmpwd1 = wd r8s_nexus_fn1 = "horsetr_nexus_fn.nex" r8s_ultra_logfn = run_r8s_1calib(tr=horsetr, calibration_node_tip_specifiers, r8s_method="NPRS", addl_cmd="", calibration_age=1.0, nsites=nsites_tmp, tmpwd=tmpwd1, r8s_nexus_fn=r8s_nexus_fn1, r8s_path=r8s_path_tmp) r8s_tr_ultraNPRS = extract_tree_from_r8slog(logfn=r8s_ultra_logfn, delimiter=" = ") # Does this look like a reasonable result? plot(r8s_tr_ultraNPRS) axisPhylo() add.scale.bar() # Look at the tree. Believable? ######################################################### # Testing the molecular clock in R: note -- doesn't work ######################################################### # This is roughly how you would check for a molecular clock # in R...however, there seem to be bugs in optim.pml that produce # VERY weird branch lengths # get the DNA data datafn = "cons_Ecab_chrM_12May2010.fasta_nn.aln_prc.nexus.phy" mtDNA = read.phyDat(file=datafn, format="phylip", type="DNA") # set up a simple model. fitmodel_default = pml(tree=unroot(horsetr), data=mtDNA) # optimize the model on default options (this does not estimate base frequencies etc.) #fitmodel_simple = optim.pml(fitmodel_default) # look at the output, and at fitmodel_simple. E.g., try each of these (fitmodel_simple) class(fitmodel_simple) attributes(fitmodel_simple) summ(fitmodel_simple) # What has the function returned? # optimize all of the sequence evolution model parameters # and let the branch lengths be freely-estimated parameters # here, we are just letting base frequencies and branch lengths vary, # otherwise we are using a Jukes-Cantor mode fitmodel_brlens = optim.pml(fitmodel_default, optBf=FALSE, optQ=FALSE, optInv=FALSE, optGamma=FALSE, optEdge=TRUE, optRate=FALSE, maxit=100) # now fit the global rate of sequence evolution fitmodel_rate = optim.pml(fitmodel_brlens, optBf=FALSE, optQ=FALSE, optInv=FALSE, optGamma=FALSE, optEdge=FALSE, optRate=TRUE) # extract the tree fit_tree_no_ultra_tmp = fitmodel_rate$tree fit_tree_no_ultra = midpoint(fit_tree_no_ultra_tmp) plot(fit_tree_no_ultra) # Now get the likelihood for the ultrametric tree #r8s_tr_ultraLF # set up a simple model. fitmodel_default2 = pml(tree=unroot(r8s_tr_ultraLF), data=mtDNA) # don't optimize the branch lengths this time (we are fixing them to be ultrametric) fitmodel_brlens2 = optim.pml(fitmodel_default2, optBf=TRUE, optQ=TRUE, optInv=FALSE, optGamma=TRUE, optEdge=FALSE, optRate=FALSE) plot(midpoint(fitmodel_brlens2$tree)) # now fit the global rate of sequence evolution fitmodel_rate2 = optim.pml(fitmodel_brlens2, optBf=FALSE, optQ=FALSE, optInv=FALSE, optGamma=FALSE, optEdge=FALSE, optRate=TRUE) plot(midpoint(fitmodel_rate2$tree)) # extract the tree fit_tree_ultra_tmp = fitmodel_rate2$tree fit_tree_ultra = midpoint(fit_tree_ultra_tmp) plot(fit_tree_ultra) # Now, compare the likelihoods with a Likelihood Ratio Test # ...however, the optim.pml doesn't # seem to work very well, so let's not do this. ######################################################### # Question: What is the model? What is the likelihood of the data # under this model and tree?