############################################################## # ============================================= # R and phylogenetics, starting from scratch # ============================================= # by Nick Matzke (and whoever else adds to this PhyloWiki page) # Copyright 2011-infinity # matzkeATberkeley.edu # January 2011 # # Please link/cite if you use this, email me if you have # thoughts/improvements/corrections. # ############################################################## # # Free to use/redistribute under: # Attribution-NonCommercial-ShareAlike 3.0 Unported (CC BY-NC-SA 3.0) # # This program is free software; you can redistribute it and/or # modify it under the terms of the above license, linked here: # # http://creativecommons.org/licenses/by-nc-sa/3.0/ # # Summary: # # You are free: # # * to Share Ñ to copy, distribute and transmit the work # * to Remix Ñ to adapt the work # # Under the following conditions: # # * Attribution Ñ You must attribute the work in the manner # specified by the author or licensor (but not in any way that # suggests that they endorse you or your use of the work). # * Noncommercial Ñ You may not use this work for commercial purposes. # # * Share Alike Ñ If you alter, transform, or build upon this work, # you may distribute the resulting work only under the same or # similar license to this one. # # http://creativecommons.org/licenses/by-nc-sa/3.0/ # ################################################################### # # Assignment for Thursday, Feb. 3 # 1. Install R on your own computer # # 2. Take this test file and copy/paste in each of the commands # and see what they do. In my experience, these commands # cover about 80% of the commands you actually need do # use R. # # 3. Each time you see "#desc:", type YOUR quick description of # what the function is doing (and what options are being used in the # command, if needed). Hints: # # * "?functionname" brings up the help page for the command # * in the R GUI, "functionname(" brings up the proper syntax at # the bottom of the R GUI window # # 4. Turn in by PASTING your text file into email to me. # * Pretty please: start the SUBJECT of the email with: "IB200b_Lab3_Your_Name" # # Due next lab. # # Purpose: This may seem a little tedious, but the idea is that # (a) you will remember the basic commands better if you've been # forced to write them down # # (b) to get you in the habit of saving ALL your working R commands # in a text file/script for future reference and future use # # (c) producing well-commented code, so you can figure out what you # did when you return to a project six months later # ################################################################### # Stuff that is handy at the beginning of a script ###################################################### # R, close all devices/windows/ quartz windows graphics.off() # Turn off all BS stringsAsFactors silliness # R tends to read options(stringsAsFactors = FALSE) ###################################################### # Working directories: # One of the first things you want to do, usually, is # decide on your working directory # On my Mac, this is the directory wd = "/Users/nick/Desktop/_ib200a/ib200b_sp2011/lab03" # On a PC, you might have to specify paths like this: #wd = "c:\\Users\\nick\\Desktop\\_ib200a\\ib200b_sp2011\\lab03" #desc: set working directory setwd(wd) #desc: get working directory getwd() #desc: list the files in the directory list.files() # Sourcing scripts # Local sourcing: # source a script (windows machines require \\ instead of /) # sourcedir = '/_njm/' #source3 = '_genericR_v1.R' #source(paste(sourcedir, source3, sep="")) # You need the APE package to source genericR_v1.R scripts install.packages("ape") library(ape) # Remote sourcing: source("http://ib.berkeley.edu/courses/ib200b/scripts/_genericR_v1.R") #desc: concatentate to a list with c() student_names = c("Nick", "Tom", "Bob") # describe what the function "c" does: #desc: grade1 = c(37, 100, 60) grade2 = c(43, 80, 70) grade3 = c(100, 90, 100) grade1 grade2 grade3 print(grade3) #desc: column bind (cbind) temp_table = cbind(student_names, grade1, grade2, grade3) #desc: convert to data frame grade_data = as.data.frame(temp_table) # Try the same command this way. What happens differently? #desc: convert to data frame, print to screen (grade_data = as.data.frame(temp_table)) col_headers = c("names", "test1", "test2", "test3") #desc: add column heading names(grade_data) = col_headers print(grade_data) # change the column names back old_names = c("student_names", "grade1", "grade2", "grade3") names(grade_data) = old_names # R can be very annoying in certain situations, e.g. treating numbers as character data # What does as.numeric do? #desc: grade_data$grade1 = as.numeric(grade_data$grade1) grade_data$grade2 = as.numeric(grade_data$grade2) grade_data$grade3 = as.numeric(grade_data$grade3) (grade_data) # If you close R, reopen it, and re-run all of the commands above, # except leave out "options(stringsAsFactors = FALSE)", # you will see the bad thing which results. # I got so annoyed at R's tendency to convert data intended as # numeric (numbers) or text (character) into factors (categorical) # that I wrote a function specificially to check the "class" # of each column: # summary of an object's class, attributes, dimensions, length, # and class of each column, if it has columns summ(grade_data) # Print just the classes of the columns in a data frame cls.df(grade_data) # I also wrote functions to convert all columns classed as factors # into numbers or characters: dfnums_to_numeric(grade_data) # calculate means, etc. #desc: mean of one column mean(grade_data$grade1) #desc: apply the mean function over the rows, for just the numbers columns (2, 3, and 4) apply(grade_data[ , 2:4], 1, mean) # Other summary statistics mean(grade_data) # What caused the warning message in mean(grade_data)? #desc: sum(grade_data$grade1) median(grade_data$grade1) #desc: standard deviation apply(grade_data[ , 2:4], 1, sd) # store st. dev and multiply by 2 mean_values = apply(grade_data[ , 2:4], 1, mean) sd_values = apply(grade_data[ , 2:4], 1, sd) 2 * sd_values # print to screen even within a function: print(sd_values) #desc: row bind (rbind) grade_data2 = rbind(grade_data, c("means", mean_values), c("stds", sd_values)) ############################################# # install a package (only have to do once) ############################################# # Type this: odd(13) # What happened? #desc: # Now do this: install.packages("gtools") # gtools contains functions for odd/even (and many other things) # after a package is installed, you have to library() it to use its # functions during an R session library(gtools) # Now type this: odd(13) # For loops # Here, we are using for loops, if statements, and the gtools function "odd" # What does the code in this for loop do? #desc: for (i in 1:10) { if (odd(i) == TRUE) { print(paste(i, "is odd!", sep=" ")) } else { print("Blah!") } } #desc: paste("This", "is", "fun", sep=" ") # print can be annoying, use cat for (i in 1:10) { if (odd(i) == TRUE) { cat(paste(i, "is odd!", "\n", sep=" ")) } else { cat("Blah!\n" ) } } # How to make your own function # (These can be sources from a source script, which is # a .R file either on your computer get_square <- function(x) { output = x^2 return(output) } x = 4 #desc: (newval = get_square(x)) #desc: Write to tab-delimited text file fn = "grade_data.txt" write.table(grade_data, file=fn, quote=FALSE, sep=" ", row.names=TRUE, col.names=TRUE) # read it back in new_data = read.table(fn, header=TRUE, sep=" ", quote="", stringsAsFactors = FALSE, strip.white=TRUE, fill=TRUE) # plots and stats #desc: plot(new_data$grade1, new_data$grade2) #desc: title("Hi Tom!") #desc: plot(new_data$grade1, new_data$grade2, xlab="scores for grade #1", ylab="scores for grade #2") #desc: lines(new_data$grade1, new_data$grade2) #desc: pairs(new_data[, 2:4]) #desc: cor(new_data[, 2:4]) # A function I stole from somewhere and put in genericR_v1: pairs_with_hist(new_data[, 2:4]) # CTRL-left or right to arrow through plots # help on any function ?mean ?std # once you are looking at the help page, go to the bottom & click index to see all the options for the package # or just e.g. ?gtools # search for text in help # (marginally useful) ??histogram # I like to google: ' "r-help" something something ' # ...since someone has always asked my question on the r-help listserv ############################################### # Basic crash course in APE # The R Package APE: Analysis of Phylogenetics # and Evolution # # Paradis's book on APE is linked from the # course website: # http://ib.berkeley.edu/courses/ib200b/IB200B_SyllabusHandouts.shtml # (for Feb. 3) ############################################### # Install APE install.packages("ape") # (This should install some other needed packages also) library(ape) # This is what a Newick string looks like: newick_str = "(((Humans, Chimps), Gorillas), Orangs);" tr = read.tree(text=newick_str) plot(tr) # What is the data class of "tr"? #desc: class(tr) # Is there any difference in the graphic produced by these two commands? #desc: plot(tr) plot.phylo(tr) # What is the difference in the result of these two help commands? #desc: ?plot ?plot.phylo # What are we adding to the tree and the plot of the tree, this time? #desc: newick_str = "(((Humans:6.0, Chimps:6.0):1.0, Gorillas:7.0):1.0, Orangs:8.0):1.0;" tr = read.tree(text=newick_str) plot(tr) # What are we adding to the tree and the plot of the tree, this time? #desc: newick_str = "(((Humans:6.0, Chimps:6.0)LCA_humans_chimps:1.0, Gorillas:7.0)LCA_w_gorillas:1.0, Orangs:8.0)LCA_w_orangs:1.0;" tr = read.tree(text=newick_str) plot(tr, show.node.label=TRUE) # More on Newick format, which, annoyingly, is sometimes inconsistent: # http://en.wikipedia.org/wiki/Newick_format # Have a look at how the tree is stored in R tr #desc: tr$tip.label #desc: tr$edge #desc: tr$edge.length #desc: tr$node.label # If you forget how to find these, you can use the "attributes" function #desc: attributes(tr) # Or my "summ" function summ(tr) # However, I got really tired of typing all that stuff, and # trying to remember which branch points to what node (and the nodes # are only implicitly encoded in R anyway, not explicitly!), so I wrote # the function "prt" to print the tree structure to screen all at once: # Local sourcing: #sourcedir = '/_njm/' #source3 = '_R_tree_functions_v1.R' #source(paste(sourcedir, source3, sep="")) # Remote sourcing: source("http://ib.berkeley.edu/courses/ib200b/scripts/_R_tree_functions_v1.R") prt(tr) # What information is given in each column of the output from "prt"? #node: #ord_ndname: #node_lvl: #node.type: #parent_br: #edge.length: #ancestor: #daughter_nds: #node_ht: #time_bp: #label: # Now plot the tree in different ways: # (CTRL-right or CTRL-left to flip between the trees in the graphics window) plot(tr, type="phylogram") #desc: plot(tr, type="phylogram", direction="rightwards") plot(tr, type="phylogram", direction="leftwards") plot(tr, type="phylogram", direction="upwards") plot(tr, type="phylogram", direction="downwards") #desc: plot(tr, type="cladogram") plot(tr, type="fan") plot(tr, type="unrooted") plot(tr, type="radial") #desc: plot(tr, type="unrooted", edge.width=5) plot(tr, type="unrooted", edge.width=5, edge.color="blue") plot(tr, type="unrooted", edge.width=5, edge.color="blue", lab4ut="horizontal") plot(tr, type="unrooted", edge.width=5, edge.color="blue", lab4ut="axial") # In R GUI, you can save any displayed tree to PDF, or do a screen capture etc. # you can also save a tree to PDF as follows: pdffn = "homstree.pdf" pdf(file=pdffn) plot(tr, type="unrooted", edge.width=5, edge.color="blue", lab4ut="axial") dev.off() # In Macs (and maybe PCs), this will open the PDF from R: cmdstr = paste("open ", pdffn, sep="") system(cmdstr) # How to save the tree as text files #desc: newick_fn = "homstree.newick" write.tree(tr, file=newick_fn) # moref is another function I wrote, to work vaguely like the UNIX "more" command #desc: moref(newick_fn) #desc: nexus_fn = "homstree.nexus" write.nexus(tr, file=nexus_fn) moref(nexus_fn) # To conclude the lab, I wanted to find, download, and display # a "tree of life". # # To do this, I went to the TreeBase search page: # http://www.treebase.org/treebase-web/search/studySearch.html # # ...and searched on studies with the title "tree of life" # # Annoyingly, the fairly famous tree from: # # Ciccarelli F.D. et al. (2006). "Toward automatic reconstruction of # a highly resolved tree of life." Science, 311:1283-1287. # http://www.sciencemag.org/content/311/5765/1283.abstract # # ...was not online, as far as I could tell. And a lot of these are the "turtle trees of life", etc. # Lame. But this one was a tree covering the root of known # cellular life. # # Caetano-anollŽs G. et al. (2002). "Evolved RNA secondary structure # and the rooting of the universal tree of life." Journal of # Molecular Evolution. # # Check S796 for this study, then click over to the "Trees" tab to get the # tree... # # http://www.phylowidget.org/full/?tree=%27http://www.treebase.org/treebase-web/tree_for_phylowidget/TB2:Tr3931%27 # # Or, download the tree from our website, here: # http://ib.berkeley.edu/courses/ib200b/labs/Caetano-anolles_2002_JME_ToL.newick # load the tree and play with it: newick_fn = "Caetano-anolles_2002_JME_ToL.newick" tree_of_life = read.tree(newick_fn) plot(tree_of_life, type="cladogram") plot(tree_of_life, type="phylogram") plot(tree_of_life, type="unrooted", lab4ut="axial") # aw, no branch lengths in TreeBase! Topology only! Lame! # Students ignore this: # Make the version of this code for handout (no answers) source3 = '_genericR_v1.R' source(paste(sourcedir, source3, sep="")) newfn = replace_each_matching_line("_Nicks_intro_to_R_v3.R", "#desc:", "#desc:") moref(newfn)