# ============================================= # genericR_v1.R: many useful utility functions # for R # # by Nick Matzke # Copyright 2011-infinity # matzkeATberkeley.edu # January 2011 # # Please link/cite if you use this, email me if you have # thoughts/improvements/corrections. # ############################################################## # # Free to use/redistribute under: # Attribution-NonCommercial-ShareAlike 3.0 Unported (CC BY-NC-SA 3.0) # # This program is free software; you can redistribute it and/or # modify it under the terms of the above license, linked here: # # http://creativecommons.org/licenses/by-nc-sa/3.0/ # # Summary: # # You are free: # # * to Share — to copy, distribute and transmit the work # * to Remix — to adapt the work # # Under the following conditions: # # * Attribution — You must attribute the work in the manner # specified by the author or licensor (but not in any way that # suggests that they endorse you or your use of the work). # * Noncommercial — You may not use this work for commercial purposes. # # * Share Alike — If you alter, transform, or build upon this work, # you may distribute the resulting work only under the same or # similar license to this one. # # http://creativecommons.org/licenses/by-nc-sa/3.0/ # ################################################################### # Generic utility functions for R # Load with: #sourcedir = '/_njm/' #source3 = '_genericR_v1.R' #source(paste(sourcedir, source3, sep="")) # abbreviations adf <- function(x) { return(as.data.frame(x, row.names=NULL, stringsAsFactors=FALSE)) } # length, like in Python len <- function(x) { return(length(x)) } # the number of distinct values may be an issue, # let's count those count_uniq <- function(x) { length(unique(x)) } count_zeros <- function(x) { sum(x==0) } # Normalize values to a large positive number normalize_vals <- function(char_dtf, maxvals = NULL, baseval=0, normval=100) { # Normalize values to between 0 and 1 # set these vals to 1 if (is.null(maxvals)) { maxvals = apply(char_dtf, 2, max) } char_dtf_fraction_of_one = char_dtf for (rownum in 1:nrow(char_dtf)) { char_dtf_fraction_of_one[rownum, ] = char_dtf[rownum, ] / maxvals } # Multiply by 100 char_dtf_fraction_of_100 = char_dtf_fraction_of_one * 100 # Add 1000 char_dtf_1000 = char_dtf_fraction_of_100 + 1000 # to reverse, subtract 1000, divide by 100, multiply by maxval } # Summarize object summ <- function(x) { cat("\n") cat("SUMM(): PROVIDING OVERALL SUMMARY OF OBJECT...\n") cat("\nCLASS OF OBJECT:\n") print(class(x)) cat("\nDIMENSIONS OF OBJECT:\n") print(dim(x)) cat("\nLENGTH OF OBJECT:\n") print(length(x)) cat("\nATTRIBUTES OF OBJECT:\n") print(attributes(x)) cat("\nSUMMARY() OF OBJECT:\n") print(summary(x)) cat("\ncls.df(): print the classes of each column (if it's a data.frame):\n") cls.df(x, printout=TRUE) } print.default2 <- function (x, digits = NULL, quote = TRUE, na.print = NULL, print.gap = NULL, right = FALSE, max = NULL, useSource = TRUE, ...) { if (is.numeric(x)) { x <- as.numeric(sprintf("%7.3f", x)) } noOpt <- missing(digits) && missing(quote) && missing(na.print) && missing(print.gap) && missing(right) && missing(max) && missing(useSource) && length(list(...)) == 0L .Internal(print.default(x, digits, quote, na.print, print.gap, right, max, useSource, noOpt)) } prflag <- function(x, printflag=TRUE) { # A standard function to print (or not) certain variables, # based on a master printflag # This avoids having to comment in/out various code chunks # while debugging. if (printflag == TRUE) { # CAT instead of PRINT if it's a string or numeric if (is.character(x)) { cat(x, "\n", sep="") } if (is.numeric(x)) { cat(x, "\n", sep="") } else { print(x) } } else { pass="BLAH" } } # Backup file, if it exists fn_bkup <- function(tmpdir, fn) { # Check if the old log file exists files_list = list.files(path=tmpdir) if (fn %in% files_list) { cmdstr = paste("mv ", fn, " ", paste(fn, ".bkup", sep=""), sep="") system(cmdstr) } } printall <- function(dtf, chunksize_toprint = 50, printflag=TRUE) { # Print everything in a data frame, in chunks of e.g. 50 rows if (nrow(dtf) <= chunksize_toprint) { prflag(dtf) return(dtf) } rows_toprint = seq(1, nrow(dtf), chunksize_toprint) for (i in 1 : (length(rows_toprint)-1) ) { tmp_rows_toprint_start = rows_toprint[i] tmp_rows_toprint_end = rows_toprint[i+1] prflag(dtf[tmp_rows_toprint_start:tmp_rows_toprint_end, ]) } } # Get indices of all matches to a list findall <- function(what, inlist) { TFmatches = inlist == what indices = 1:length(inlist) matching_indices = indices[TFmatches] return(matching_indices) } # print to screen the header of a file headf <- function(fn) { lines = scan(file=fn, what="character", sep="\n") for (i in 1:5) { print(lines[i]) } } # print to screen the tail of a file tailf <- function(fn) { lines = scan(file=fn, what="character", sep="\n") numlines = length(lines) for (i in (numlines-5):numlines) { print(lines[i]) } } # remove punctuation remove_punct <- function(tempstr) { # Do find-replace to convert the names to all underscores, no spaces, no ' or " # spaces to _ tempstr = gsub(" ", "_", tempstr) # - to _ tempstr = gsub("-", "_", tempstr) # . to _ tempstr = gsub("\\.", "_", tempstr) # "'" to "" tempstr = gsub("'", "", tempstr) # '"' to '' tempstr = gsub('"', '', tempstr) return(tempstr) } # remove punctuation from a list remove_punct_list <- function(templist) { for (k in 1:length(templist)) { tempstr = templist[[k]] templist[[k]] = remove_punct(tempstr) } return(templist) } # split out: Taxon (subtaxon) split_out_subtaxon <- function(templist) { templist_col1 = templist templist_col2 = templist for (k in 1:length(templist)) { tempstr = templist[[k]] words = strsplit(tempstr, " \\(")[[1]] if (length(words) == 1) { templist_col1[[k]] = words[1] templist_col2[[k]] = "" } if (length(words) == 2) { templist_col1[[k]] = words[1] templist_col2[[k]] = gsub("\\)", "", words[2]) } } newcols = cbind(templist_col1, templist_col2) return(newcols) } # http://r.789695.n4.nabble.com/Remove-empty-list-from-list-td806706.html delete.NULLs <- function(x.list) { # delele null/empty entries in a list x.list[unlist(lapply(x.list, length) != 0)] } remove_len_zero <- function(tmparr) { newarr = tmparr[lapply(tmparr, nchar) != 0] return(newarr) } # match using 2 criteria match_items <- function(tmplist, crit1, crit2) { outlist = c() crit1_list = c() for (i in 1:length(tmplist)) { if (grepl(crit1, tmplist[i])) { crit1_list = c(crit1_list, tmplist[i]) } } crit2_list = c() for (i in 1:length(crit1_list)) { if (grepl(crit2, crit1_list[i])) { crit2_list = c(crit2_list, crit1_list[i]) } } return(crit2_list) } # match using crit1, but skip (leave out) if they have crit2 match_item_nomatch <- function(tmplist, crit1, crit2) { outlist = c() crit1_list = c() for (i in 1:length(tmplist)) { if (grepl(crit1, tmplist[i])) { crit1_list = c(crit1_list, tmplist[i]) } } crit2_list = c() for (i in 1:length(crit1_list)) { if (grepl(crit2, crit1_list[i])) { blah = "blah" } else { crit2_list = c(crit2_list, crit1_list[i]) } } return(crit2_list) } replace_str_file <- function(fn, oldstr, newstr) { # Read a text file into a list of strings lines = scan(fn, what="character", sep="\n") # Replace output date-calibrated parsimony trees file newlines = replace_str_lines(lines, oldstr, newstr) # Write the list of lines to a file write.table(newlines, file=fn, quote=FALSE, append=FALSE, sep="", row.names = FALSE, col.names=FALSE) cat("In: ", fn, ", '", oldstr, "' replaced with '", newstr, "'\n", sep="") return(fn) } replace_str_lines <- function(lines, oldstr, newstr) { # Replace output date-calibrated parsimony trees file lines = gsub(oldstr, newstr, lines) return(lines) } # replace each in a text file that matches, with newstr replace_each_matching_line <- function(fn, str_to_find, newstr)#, blankstr="___") { # Read a text file into a list of strings lines = scan(fn, what="character", blank.lines.skip=FALSE, sep="\n") for (i in 1:length(lines)) { # if (lines[i] == "") # { # lines[i] = blankstr # } # else # { if (grepl(str_to_find, lines[i])) { #print(str_to_find) #print(lines[i]) lines[i] = newstr cat("In: ", fn, ", line #", i, " replaced with '", newstr, "'\n", sep="") } # } } # Write the list of lines to a file newfn = paste(fn, "_new", sep="") write.table(lines, file=newfn, quote=FALSE, append=FALSE, sep="", row.names = FALSE, col.names=FALSE) return(newfn) } # Null lines that match null_lines <- function(list_of_lines, str_to_match) { list_of_lines2 = c() for (i in 1:length(list_of_lines)) { if (grepl(str_to_match, list_of_lines[i])) { # This, in effect, deletes the row containing the string blah = 0 } else { list_of_lines2 = c(list_of_lines2, list_of_lines[i]) } } return(list_of_lines2) } # Concatenate a list of files into one big file cat_files <- function(infiles_list, outfn) { new_lines = c() list_num_files = c() for (i in 1:length(infiles_list)) { # input file infn = infiles_list[i] lines = scan(infn, what="character", sep="\n") list_num_files = c(list_num_files, length(lines)) new_lines = c(new_lines, lines) } new_lines = write.table(new_lines, file=outfn, quote=FALSE, append=FALSE, sep="", row.names = FALSE, col.names=FALSE) return(list_num_files) } # print to screen the header of a file moref <- function(fn, printnotcat = FALSE) { lines = scan(file=fn, what="character", sep="\n") if (printnotcat == TRUE) { for (i in 1:length(lines)) { print(lines[i]) } } else { for (i in 1:length(lines)) { cat(paste(lines[i], "\n", sep="")) } } } print_table_sigs <- function(x, numsig=4, printout=FALSE) { new.table = signif_digits_df(dfnums_to_numeric(x), numsig, printout) cat("\n") print(new.table) cat("\n") return(new.table) } print_table_row <- function(x, numsig=4, printout=FALSE) { if (length(numsig) == 1) { new.table = signif(as.numeric(x), numsig) } else { tmplist = rep(NA, length(numsig)) for (i in 1:length(numsig)) { cmdstr = paste('tmplist[i] = sprintf("%1.', numsig[i], 'f", x[i])', sep="") eval(parse(text = cmdstr)) } } outstr = "" for (item in tmplist) { outstr = paste(outstr, trim(item), sep=" ") } print(paste(tmplist, sep=" "), quote=FALSE) #cat("\n") return(tmplist) } # Get the classes of the columns in a data frame cls.df <- function(dtf, printout=FALSE) { dtf_names = names(dtf) numcols = ncol(dtf) cls_col_list = c() for (i in 1:numcols) { # Get one column: cmdstr = paste("cls_col = class(dtf$", dtf_names[i], ")", sep="") eval(parse(text = cmdstr)) #cat(i, ": ", dtf_names[i], " = ", cls_col, "\n", sep="") cls_col_list[i] = cls_col } # row names colnum = 1:numcols dtf_classes = cbind(colnum, dtf_names, cls_col_list) dtf_classes = data.frame(dtf_classes, row.names=colnum) # Print the output if true if (printout) { cat("\n") cat("cls.df(dtf) reports: dataframe 'dtf' has ", nrow(dtf), " rows, ", numcols, " columns.\n", sep="") cat("...names() and classes() of each column below...\n", sep="") cat("\n") print(dtf_classes) cat("\n") } return(dtf_classes) } dfnums_to_numeric <- function(dtf, max_NAs=0.5, printout=FALSE) { dtf_classes = cls.df(dtf, printout=FALSE) dtf_names = names(dtf) numcols = ncol(dtf) cls_col_list = c() for (i in 1:numcols) { # Get one column: cmdstr = paste("cls_col = class(dtf$", dtf_names[i], ")", sep="") eval(parse(text = cmdstr)) #cat(i, ": ", dtf_names[i], " = ", cls_col, "\n", sep="") cls_col_list[i] = cls_col } for (i in 1:numcols) { if (cls_col_list[i] == "list") { next() } if (cls_col_list[i] != "numeric") { # Get one column, convert to numeric: cmdstr = paste("newcol = as.numeric(as.character(dtf$", dtf_names[i], "))", sep="") eval(parse(text = cmdstr)) # If it's less than 50% NAs (or max_NA NAs), then convert to numeric #print(newcol) #print(max_NAs * length(newcol)) #print(sum(is.na(newcol))) if (sum(is.na(newcol)) < (max_NAs * length(newcol))) { # Get the column, convert to numeric: # (if it's a factor, you have to convert to character, then to numeric) # (if it's a character, you can convert to character anyway...) #cmdstr = paste("dtf$", dtf_names[i], " = as.numeric(as.character(dtf$", dtf_names[i], "))", sep="") cmdstr = paste("dtf$", dtf_names[i], " = newcol", sep="") eval(parse(text = cmdstr)) } } } tmp_classes = cls.df(dtf) dtf_classes$newclasses = tmp_classes[,ncol(tmp_classes)] if(printout) { cat("\n") cat("dfnums_to_numeric(dtf, max_NAs=", max_NAs, ") reports: dataframe 'dtf_classes' has ", nrow(dtf_classes), " rows, ", ncol(dtf_classes), " columns.\n", sep="") cat("...names() and classes() of each column below...\n", sep="") cat("\n") print(dtf_classes) } return(dtf) } df_everything_to_char <- function(dtf, max_NAs=0.5) { dtf_names = names(dtf) numcols = ncol(dtf) cls_col_list = c() for (i in 1:numcols) { # Get one column, convert to character: cmdstr = paste("dtf$", dtf_names[i], " = as.character(dtf$", dtf_names[i], ")", sep="") eval(parse(text = cmdstr)) } return(dtf) } df_factors_to_char <- function(dtf, max_NAs=0.5) { dtf_classes = cls.df(dtf, printout=TRUE) dtf_names = names(dtf) numcols = ncol(dtf) cls_col_list = c() for (i in 1:numcols) { # Get one column: cmdstr = paste("cls_col = class(dtf$", dtf_names[i], ")", sep="") eval(parse(text = cmdstr)) #cat(i, ": ", dtf_names[i], " = ", cls_col, "\n", sep="") cls_col_list[i] = cls_col } for (i in 1:numcols) { if (cls_col_list[i] == "factor") { # Get one column, convert to character: cmdstr = paste("newcol = as.character(dtf$", dtf_names[i], ")", sep="") eval(parse(text = cmdstr)) cmdstr = paste("dtf$", dtf_names[i], " = newcol", sep="") eval(parse(text = cmdstr)) } } tmp_classes = cls.df(dtf) dtf_classes$newclasses = tmp_classes[,ncol(tmp_classes)] cat("\n") cat("dfnums_to_numeric(dtf, max_NAs=", max_NAs, ") reports: dataframe 'dtf_classes' has ", nrow(dtf_classes), " rows, ", ncol(dtf_classes), " columns.\n", sep="") cat("...names() and classes() of each column below...\n", sep="") cat("\n") print(dtf_classes) return(dtf) } # Remove factors from the character-like columns of a data.frame # (leaves the numbers as numbers) df_nonum_factors_to_char <- function(dtf, max_NAs=0.5) { dtf_classes = cls.df(dtf, printout=TRUE) dtf_names = names(dtf) numcols = ncol(dtf) cls_col_list = c() for (i in 1:numcols) { # Get one column: cmdstr = paste("cls_col = class(dtf$", dtf_names[i], ")", sep="") eval(parse(text = cmdstr)) #cat(i, ": ", dtf_names[i], " = ", cls_col, "\n", sep="") cls_col_list[i] = cls_col } for (i in 1:numcols) { if (cls_col_list[i] == "factor") { # Get one column, convert to character: cmdstr = paste("newcol = as.character(dtf$", dtf_names[i], ")", sep="") eval(parse(text = cmdstr)) cmdstr = paste("dtf$", dtf_names[i], " = newcol", sep="") eval(parse(text = cmdstr)) } } tmp_classes = cls.df(dtf) dtf_classes$newclasses = tmp_classes[,ncol(tmp_classes)] cat("\n") cat("dfnums_to_numeric(dtf, max_NAs=", max_NAs, ") reports: dataframe 'dtf_classes' has ", nrow(dtf_classes), " rows, ", ncol(dtf_classes), " columns.\n", sep="") cat("...names() and classes() of each column below...\n", sep="") cat("\n") print(dtf_classes) return(dtf) } moving_average <- function(xvals, yvals, byvals) { # moving averages intervals = seq(from=min(xvals), to=max(xvals), by=byvals) xmeans = c() ymeans = c() for (i in 1:(length(intervals)-1)) { starti = intervals[i] endi = intervals[i+1] indices1 = xvals >= starti indices2 = xvals < endi indices_in_range = (indices1 + indices2) > 1 vals = yvals[indices_in_range] if (length(vals) > 0) { xmean = mean(c(starti, endi)) ymean = mean(vals) xmeans = c(xmeans, xmean) ymeans = c(ymeans, ymean) } } tmp = cbind(xmeans, ymeans) moving_avgs = data.frame(tmp) return(moving_avgs) } extract_lines_startnum_to_flag <- function(lines, row_to_start, string_to_stop_at) { doneflag = 0 lines_to_keep = c() lines_to_keep_i = 0 for (i in 1:length(lines)) { line = lines[i] # skip the 1st two lines if (i < row_to_start) { blah = 0 } else { if (doneflag == 1) { blah = 0 } else { # if line starts with a dash, you are done... if (grepl(string_to_stop_at, line)) { #print("yo!") doneflag = 1 } else { lines_to_keep_i = lines_to_keep_i + 1 print(paste("Storing line #", lines_to_keep_i, sep="")) lines_to_keep[[lines_to_keep_i]] = line } } } } return(lines_to_keep) } # This will only extract 1 group between 1st startflag & 1st doneflag extract_lines_startstr_to_endstr <- function(lines, string_to_start_at, string_to_end_at, printflag=TRUE) { startflag = 0 doneflag = 0 lines_to_keep = c() lines_to_keep_i = 0 for (i in 1:length(lines)) { line = lines[i] if (grepl(string_to_start_at, line)) { startflag = 1 } if (grepl(string_to_end_at, line)) { doneflag = 1 } # end if done; if not, check startflag if (doneflag == 1) { blah = 0 } else { if (startflag == 1) { lines_to_keep_i = lines_to_keep_i + 1 if(printflag) { print(paste("Storing line #", lines_to_keep_i, sep="")) } lines_to_keep[[lines_to_keep_i]] = line } else { blah = 0 } } } return(lines_to_keep) } # Extract the strings that have a certain word at a certain position extract_lines_with_word_at_wordnum <- function(lines, word_to_find, wordnum=1, delimiter=" ", printflag=FALSE) { linenums_to_keep = c() for (i in 1:length(lines)) { tmpline = lines[i] if (tmpline == "") { next } words = strsplit(tmpline, split=delimiter)[[1]] #print(words) if (words[wordnum] == word_to_find) { #print(words[wordnum]) linenums_to_keep[length(linenums_to_keep)+1] = i if(printflag) { print(paste("Storing line #", i, sep="")) } } } #print(linenums_to_keep) newlines = lines[linenums_to_keep] return(newlines) } array_to_text <- function(inarray, spacer) { tmpstr = inarray[1] for (i in 2:length(inarray)) { tmpstr = paste(tmpstr, inarray[i], sep=spacer) } return(tmpstr) } minmax_pretty <- function(x) { minmax = c( min(pretty(x)), max(pretty(x)) ) return(minmax) } minmax_literal <- function(x) { minmax = c( min(x, na.rm=TRUE), max(x, na.rm=TRUE) ) return(minmax) } convert_nums_to_circular_lat <- function(nums, maxval=90) { mini = floor(min(nums)) maxi = ceiling(max(nums)) nums5 = rep(0, length(nums)) # go through 0-maxval, maxval-2*maxval, 2*maxval-3*maxval, 3*maxval-4*maxval # translates:0-maxval, maxval-0, 0- -maxval, -maxval-0 edgeval = maxval * 4 # get the nums in terms of edgevals nums1 = nums/edgeval # Don't change numbers between -maxval and maxval, but change the rest of them... ############################################## # degrees > 0 (positive) ############################################## posvals = nums > 0 # remove all cycles except the last nums2 = nums1[posvals] - floor(nums1[posvals]) # convert back to lat: nums3 = nums2 * edgeval nums4 = nums3 vals_to_change = (nums3 > maxval) + (nums3 <= 2*maxval) == 2 nums4[vals_to_change] = 2*maxval - nums3[vals_to_change] vals_to_change = (nums3 > 2*maxval) + (nums3 <= 3*maxval) == 2 nums4[vals_to_change] = -1*(nums3[vals_to_change] - 2*maxval) vals_to_change = (nums3 > 3*maxval) + (nums3 <= 4*maxval) == 2 nums4[vals_to_change] = 4*maxval - nums3[vals_to_change] nums5[posvals] = nums4 ############################################## # degrees < 0 (negative) ############################################## negvals = nums < 0 # remove all cycles except the last nums2 = nums1[negvals] - ceiling(nums1[negvals]) # convert back to lat: nums3 = nums2 * edgeval nums4 = nums3 vals_to_change = (nums3 < -maxval) + (nums3 >= -2*maxval) == 2 nums4[vals_to_change] = -2*maxval - nums3[vals_to_change] vals_to_change = (nums3 < -2*maxval) + (nums3 >= -3*maxval) == 2 nums4[vals_to_change] = -1*(nums3[vals_to_change] - -1*2*maxval) vals_to_change = (nums3 < -3*maxval) + (nums3 >= -4*maxval) == 2 nums4[vals_to_change] = -4*maxval - nums3[vals_to_change] nums5[negvals] = nums4 return(nums5) } convert_nums_to_long <- function(nums, maxval=180) { # get the nums in terms of 180s nums0 = nums nums1 = nums # For nums > 0 TFnums_gt0 = nums > 0 nums1[TFnums_gt0] = nums0[TFnums_gt0]/maxval # remove all cycles except the last nums1[TFnums_gt0] = nums1[TFnums_gt0] - floor(nums1[TFnums_gt0]) # convert back to lat: nums1[TFnums_gt0] = nums1[TFnums_gt0] * maxval # For nums < 0 TFnums_lt0 = nums < 0 nums1[TFnums_lt0] = nums0[TFnums_lt0]/maxval # remove all cycles except the last nums1[TFnums_lt0] = nums1[TFnums_lt0] - ceiling(nums1[TFnums_lt0]) # convert back to lat: nums1[TFnums_lt0] = nums1[TFnums_lt0] * maxval return(nums1) } # Basic correlation plot basic_xy_plot <- function(x, y, titlestart_txt="title\n", xlab=names(x), ylab=names(y), xlim=minmax_pretty(x), ylim=minmax_pretty(y)) { cortest_xy = cor.test(x, y, xlim=c(-1,1), ylim=c(-1,1)) lm_xy = lm(y ~ x) slope = round(lm_xy$coefficients[2], 2) intercept = round(lm_xy$coefficients[1], 2) corval = round(cortest_xy$estimate, 2) #pval = round(cortest_xy$p.value, 4) pval = cortest_xy$p.value titletxt = paste(titlestart_txt, "cor=", corval, "; slope=", slope, "; int=", intercept, "; p=", pval, sep="") plot(x, y, xlab=xlab, ylab=ylab, xlim=xlim, ylim=ylim) # make segment minx = min(x, na.rm=TRUE) miny = min(y, na.rm=TRUE) maxx = max(x, na.rm=TRUE) maxy = max(y, na.rm=TRUE) x0 = minx x1 = maxx y0 = slope*x0 + intercept y1 = slope*x1 + intercept segments(x0, y0, x1, y1) title(titletxt) } plot_basic_xy <- function(x, y, titlestart_txt="title\n", xlab=names(x), ylab=names(y), xlim=minmax_pretty(x), ylim=minmax_pretty(y)) { cortest_xy = cor.test(x, y, xlim=c(-1,1), ylim=c(-1,1)) lm_xy = lm(y ~ x) slope = round(lm_xy$coefficients[2], 2) intercept = round(lm_xy$coefficients[1], 2) corval = round(cortest_xy$estimate, 2) pval = cortest_xy$p.value titletxt = paste(titlestart_txt, "cor=", corval, "; slope=", slope, "; int=", intercept, "; p=", pval, sep="") plot(x, y, xlab=xlab, ylab=ylab, xlim=xlim, ylim=ylim) # make segment minx = min(x, na.rm=TRUE) miny = min(y, na.rm=TRUE) maxx = max(x, na.rm=TRUE) maxy = max(y, na.rm=TRUE) x0 = minx x1 = maxx y0 = slope*x0 + intercept y1 = slope*x1 + intercept segments(x0, y0, x1, y1) title(titletxt) } # Fancy correlation plots #http://www.personality-project.org/R/r.useful.html #some fancy graphics -- adapted from help.cor panel.cor.scale <- function(x, y, digits=2, prefix="", cex.cor) { usr <- par("usr"); on.exit(par(usr)) par(usr = c(0, 1, 0, 1)) r = (cor(x, y,use="pairwise")) txt <- format(c(r, 0.123456789), digits=digits)[1] txt <- paste(prefix, txt, sep="") if(missing(cex.cor)) cex <- 0.8/strwidth(txt) text(0.5, 0.5, txt, cex = cex * abs(r)) } panel.cor <- function(x, y, digits=2, prefix="", cex.cor) { usr <- par("usr"); on.exit(par(usr)) par(usr = c(0, 1, 0, 1)) r = (cor(x, y,use="pairwise")) txt <- format(c(r, 0.123456789), digits=digits)[1] txt <- paste(prefix, txt, sep="") if(missing(cex.cor)) cex <- 0.8/strwidth(txt) text(0.5, 0.5, txt, cex = cex ) } panel.hist <- function(x, ...) { usr <- par("usr"); on.exit(par(usr)) par(usr = c(usr[1:2], 0, 1.5) ) h <- hist(x, plot = FALSE) breaks <- h$breaks; nB <- length(breaks) y <- h$counts; y <- y/max(y) rect(breaks[-nB], 0, breaks[-1], y, col="cyan", ...) } panel.points <- function(x, tmppch=20, ...) { usr <- par("usr"); on.exit(par(usr)) par(usr = c(usr[1:2], 0, 1.5) ) par(pch=20) pts <- points(x, pch=".") } pairs.panels <- function (x,y,smooth=TRUE,scale=FALSE) { if (smooth ) { if (scale) { pairs(x, diag.panel=panel.hist,upper.panel=panel.cor.scale,lower.panel=panel.smooth) #pairs(x, diag.panel=panel.hist,upper.panel=panel.cor.scale,lower.panel=function(x) panel.points(x) ) #pairs(x, diag.panel=panel.hist, upper.panel=panel.cor.scale, lower.panel=function(x) points(x, pch=tmppch)) #pairs(x, diag.panel=panel.hist, upper.panel=panel.cor.scale, lower.panel=panel.points)#, lower.panel=panel.points(x, pch=tmppch)) } else { #pairs(x, diag.panel=panel.hist,upper.panel=panel.cor,lower.panel=function(x) panel.points(x) ) pairs(x, diag.panel=panel.hist,upper.panel=panel.cor,lower.panel=panel.smooth) #pairs(x, diag.panel=panel.hist, upper.panel=panel.cor, lower.panel=function(x) points(x, pch=tmppch)) #pairs(x, diag.panel=panel.hist, upper.panel=panel.cor, lower.panel=panel.points)#, lower.panel=panel.points(x, pch=tmppch)) } #else {pairs(x,diag.panel=panel.hist,upper.panel=panel.cor,lower.panel=panel.smooth) } else #smooth is not true { if (scale) { pairs(x, diag.panel=panel.hist,upper.panel=panel.cor.scale) } else { pairs(x, diag.panel=panel.hist,upper.panel=panel.cor) } } #end of else (smooth) } #end of function pairs.panels.lm <- function (x,y,tmplm=TRUE,scale=FALSE) { if (tmplm) { if (scale) { pairs(x, diag.panel=panel.hist, upper.panel=panel.cor.scale, lower.panel=panel.smooth, span=0.5, f=0.5) } else { pairs(x, diag.panel=panel.hist, upper.panel=panel.cor, lower.panel=panel.smooth) } #else {pairs(x,diag.panel=panel.hist,upper.panel=panel.cor,lower.panel=panel.smooth) } else #smooth is not true { if (scale) { pairs(x, diag.panel=panel.hist,upper.panel=panel.cor.scale) } else { pairs(x, diag.panel=panel.hist,upper.panel=panel.cor) } } #end of else (smooth) } #end of function #function to replace upper triangle of matrix with unattenuated correlations, and the diagonal with reliabilities #input is a correlation matrix and a vector of reliabilities correct.cor <- function(x,y) { n=dim(x)[1] { x[1,1] <- y[1,1] for (i in 2:n) { x[i,i] <- y[1,i] #put reliabilities on the diagonal k=i-1 for (j in 1:k) { x[j,i] <- x[j,i]/sqrt(y[1,i]*y[1,j]) } #fix the upper triangular part of the matrix } return(x) }} #difference of dependent (paired) correlations (following Steiger,J., 1980, Tests for comparing elements of a correlation matrix. Psychological Bulletin, 87, 245-251) paired.r <- function(xy,xz,yz,n) { diff <- xy-xz determin=1-xy*xy - xz*xz - yz*yz + 2*xy*xz*yz av=(xy+xz)/2 cube= (1-yz)*(1-yz)*(1-yz) t2 = diff * sqrt((n-1)*(1+yz)/(((2*(n-1)/(n-3))*determin+av*av*cube))) return(t2) } fisherz <- function(rho) {0.5*log((1+rho)/(1-rho)) } #converts r to z ################################################### # LaTeX-related functions ################################################### # # to run tex2div library(tools) # You also need a well-set-up .Rprofile file to make it work (see fun_w_Rprofile.R in /njm probably) ## Set working directory ##wd = "/Users/nick/Desktop/_IB286_Marshall_paleobio/" #wd = "/Users/nick/Desktop/_2010-10-12_work/_Marshall_PBDB/_IB286_Marshall_paleobio/" #setwd(wd) runlatex <- function(fn, wd=getwd()) { #swwds = c("/usr/local/texlive/2009basic/texmf-dist/tex/latex/base/", wd, "/usr/texbin/") # Setup directories for LaTex files and the working directories swwds = c("/usr/local/texlive/2008/texmf-dist/tex/latex/base/", wd, "/usr/texbin/") # These commands build and open the Sweave tex and PDF files... # permfn = "example-2.Snw" # permfn = "hw2_v2.Snw" permfn = fn Sweave(permfn) texfn = sub(".Snw", ".tex", permfn) texi2dvi(texfn, pdf="TRUE", quiet=FALSE, texinputs=swwds) pdffn = sub(".Snw", ".pdf", permfn) cmdstr = paste("open ", pdffn, sep="") system(cmdstr) return(cmdstr) } ############################################################## # TreeRogue 0.1: TreeThief-like tree grabbing using x,y # coordinates digitized from an image of a phylogenetic tree. ############################################################## # GOAL: to process x, y coordinates into a Newick-format tree ############################################################## # by Nick Matzke # Copyright 2010 # matzke@berkeley.edu # 10/27/2010 ############################################################## # # Free to use/redistribute under GNU General Public License, version 2 # # This program is free software; you can redistribute it and/or # modify it under the terms of the GNU General Public License # as published by the Free Software Foundation; either version 2 # of the License, or (at your option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # You should have received a copy of the GNU General Public License # along with this program; if not, write to the Free Software # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. # # http://www.gnu.org/licenses/old-licenses/gpl-2.0.html # ################################################################### # # #=================================================== # Run with these commands # =================================================== # library(ape) # # put the text files (at bottom) into your working directory # wd = "/Users/nick/Desktop/__projects/2010-11-01_cone_snails/" # setwd(wd) # # xy2 = treerogue_read_files() # xy3 = treerogue_associate_branch_bottoms_with_nodes(xy2) # tr = build_tree_using_corners(xy3) # plot(tr) # #################################################################### # # NOTES # *Heavily* modified from a very limited script posted here by # bbolker@gmail.com: # https://stat.ethz.ch/pipermail/r-sig-phylo/2008-November/000173.html # # Background: I worked up this script after getting frustrated at # (1) the failure of the famous "TreeThief" to work on any modern machine; # (2) my failure to get the newer "TreeSnatcher" to work on Mac OS X 10.4.11 # (some weirdness about Java and X11, as far as I can tell), and # (3) no other good options. # # Summary: This script takes in x,y coordinates (with the lower left as the origin) # of nodes, tips, and branch bottoms ("corners"), and builds a tree out of it. # # It assumes: # (1) Your tree is horizontal left-to-right, with the tips on the right # (2) Your tree is a "square" tree (i.e. no diagonal/curved branches) # # I captured my x,y coordinates using GraphClick 3.0, available for $8 at: # http://www.arizona-software.ch/graphclick/ # # (there is a free trial download, but only lets you export 10 coordinates at a # time, so it is pointless) # # REQUIRED INPUTS: # (for each, digitize the relevant points in GraphClick, and # File-->Export to a text file): # # (Note: all text files should have a header line) # # 1. A tab-delimited text file with x and y for each internal node # # 2. A tab-delimited text file with x and y for each tip # # 2a. A text file with tip names, in order from top-to-bottom # # 3. A tab-delimited text file with x and y for each tip for each "corner" # (i.e., the bottom of each branch). # # 4. For now, do NOT digitize the bottom of the root of the tree, if the # image you are digitizing has one. You could add the root length later # manually, if you like. # # 5. The tree must be fully dichotomous (if the image you are digitizing is not, # you can "fake it" by resolving polytomies by clicking digitization points # to, in effect, manually resolve these polytomies with very short branches. # Note that you will have to add a corner for each internal node you add (!). # # The R APE package can collapse short branches to polytomies later, if you like. # # Trees to not have to be ultrametric, and digitization does not have to be # exact -- the script will attempt to match the most likely branch bottoms # to the nodes (a graphic is produced by R so that you can check the results # and tinker if necessary). # # Requires the APE library. # ############################################################# library(ape) # Assumes default filenames treerogue_read_files <- function() { internal = read_table_good("internal.txt") tips = read_table_good("tips.txt") tipnames = read_table_good("tipnames.txt") corners = read_table_good("corners.txt") # sort the tips from top to bottom in y tips = tips[order(tips$y, decreasing = TRUE), ] # sort the internals from left to right in x internal = internal[order(internal$x, decreasing=FALSE), ] if (nrow(tips) != nrow(tipnames)) { print("ERROR: the number of tips must equal the length of the tipnames!") print(paste("Instead, nrow(tipnames) =", nrow(tipnames), "and nrow(tips) =", nrow(tips), sep=" ")) } if ((nrow(tips)-1) != nrow(internal)) { print("ERROR: the number of tips-1 must equal the number of the internal nodes!") print(paste("Instead, nrow(tips) =", nrow(tips), "and nrow(internal) =", nrow(internal), sep=" ")) } nodetypes = c(rep("tip", nrow(tipnames)), rep("internal", nrow(internal))) nodenames = unlist(c(tipnames, rep("", nrow(internal)))) xy = rbind(tips, internal) xy2 = cbind(xy, nodetypes, nodenames) xy2 = as.data.frame(xy2) names(xy2) = c("x", "y", "nodetypes", "nodenames") xy2 = df_nonum_factors_to_char(xy2, max_NAs=0.5) xy2$nodetypes[xy2$x == min(xy2$x)] = "root" if (nrow(corners) != (nrow(xy2)-1)) { print("ERROR: the number of corners must equal the number of nodes-1 !") print(paste("Instead, length(nodes) =", nrow(xy2), "and nrow(corners) =", nrow(corners), sep=" ")) } # sort file so that the tips are first # tip nodes in order from top to bottom: xytips = xy[xy$tipname != "", ] tips_in_order = xy2$tipname[xy2$tipname != ""] return(xy2) } treerogue_associate_branch_bottoms_with_nodes <- function(xy2) { # Load the coordinates of the corners corners = read_table_good("corners.txt") bots = corners # Get xy data xx = xy2$x yy = xy2$y dtf = associate_branch_bottoms_with_nodes(xx, yy, bots) xy3 = cbind(xy2, dtf$chosen_node_bottoms_xx, dtf$chosen_node_bottoms_yy) names(xy3) = c(names(xy2), "cx", "cy") write.table(xy3, "linked_corners.txt", quote=FALSE, sep=" ", row.names = FALSE, col.names = TRUE) return(xy3) } # Associate branch bottom coordinates with nodes, and plot the results; # The user may then edit the output associations if they so desire. associate_branch_bottoms_with_nodes <- function(xx, yy, bots) { # There should be one less branch bottom than there are internal nodes # (because the root node (should) have no digitized "corner" beneath it) nodes = 1:length(xx) if (length(nodes) != nrow(bots) +1) { print("ERROR: the number of corners must equal the number of nodes-1 !") print(paste("Instead, length(nodes) =", length(nodes), "and nrow(bots) =", nrow(bots), sep=" ")) } else { # OK, find bottom of branch to go with each top of branch # an array saying which branches have a bottom node_with_no_bottom = rep(TRUE, length(nodes)) # these are the remaining branch bottoms that have not been associated yet bots_with_no_top = rep(TRUE, nrow(bots)) bxx = bots$x byy = bots$y botsnodes = 1:nrow(bots) # Empty values to hold nodes chosen_node_bottoms_xx = rep(NA, length(xx)) chosen_node_bottoms_yy = rep(NA, length(yy)) i=0 while (sum(node_with_no_bottom) > 1) { i=i+1 #print(i) # look for branch bottom coordinates in a narrow window to the left of the node # basically (a) smallest slope and (b) closest distance in x ## find next node to include (the rightmost internal node) maxvals = xx * node_with_no_bottom nextnode <- which( maxvals == max(maxvals, na.rm=TRUE) )[1] #################################### # Find the best matching branch bottom/corner for each node #################################### # This is trial-and-error, you may have to plink to find a function # that works. # That, or do manual edits to the tree later... #################################### ydist <- byy - yy[nextnode] xdist <- bxx - xx[nextnode] # Rank of the y distances rank_ydist = rank(abs(ydist)) # calculate the slops xyslopes <- abs(ydist/xdist) # the best ancestor will have a low slope to the branch bottom, and a short negative distance in x xdist_neg = xdist xdist_neg[xdist > 0] = 0 xdist_neg[xdist < 0] = -1 * xdist_neg[xdist < 0] # normalize to units of minimum absolute distance min_dist = (min(abs(xdist[xdist!=0]), na.rm=TRUE)) xdist_neg_norm = (xdist_neg / min_dist) # short positive distances are less good (half as good) than short negative distances xdist_pos = xdist xdist_pos[xdist < 0] = 0 xdist_pos[xdist > 0] = xdist_pos[xdist > 0] xdist_pos_norm = (xdist_pos / min_dist) * 100 rank_xdist = rank_ydist rank_xdist[xdist <= 0] = 1 rank_xdist[xdist > 0] = 10000000 ########################### # Plink here especially... ########################### rank_slope = (xyslopes^2) final_rank = rank_ydist * abs(ydist) + xyslopes^0.5 * xdist_neg_norm + xdist_pos_norm ########################### branch_bot_fits = final_rank best_fit = which(branch_bot_fits == min(branch_bot_fits, na.rm=TRUE))[1] #bottom_to_add = botsnodes[bots_with_no_top][best_fit] bottom_to_add = botsnodes[best_fit] chosen_node_bottoms_xx[nextnode] = bxx[bottom_to_add] chosen_node_bottoms_yy[nextnode] = byy[bottom_to_add] bxx = bxx[-bottom_to_add] byy = byy[-bottom_to_add] # remove the node from the list needing branch bottoms node_with_no_bottom[nextnode] = FALSE bots_with_no_top[bottom_to_add] = FALSE } } tmpdata = cbind(nodes, xx, yy, chosen_node_bottoms_xx, chosen_node_bottoms_yy) #print(tmpdata) dtf = as.data.frame(tmpdata) plot(dtf$y, dtf$chosen_node_bottoms_yy) plot(c(xx, bots$x), c(yy, bots$y), pch="") points(xx, yy, pch="n") points(bots$x, bots$y, pch="b") title("Use this plot to check if branch bottoms match nodes") segments(xx, yy, chosen_node_bottoms_xx, chosen_node_bottoms_yy) plot(c(xx, bots$x), c(yy, bots$y), pch="") text(xx, yy, label=paste("n", 1:length(xx), sep="")) text(bots$x, bots$y, labe=paste("b", 1:nrow(bots), sep="")) title("This plot has node numbers.") segments(xx, yy, chosen_node_bottoms_xx, chosen_node_bottoms_yy) return(dtf) } build_tree_using_corners <- function(xy3) { # define the tip.labels tip.labels = xy3$nodenames tip.labels = tip.labels[tip.labels != ""] if (!missing(tip.labels)) { ntips <- length(tip.labels) } xx = xy3$x yy = xy3$y cx = xy3$cx cy = xy3$cy nodes <- 1:length(xx) is.tip <- nodes <= ntips # keep track of the nodes which are unlinked unlinked_nodes = rep(TRUE, length(nodes)) # Checks (kinda) if internal nodes are ordered from left-to-right if (which.min(xx) != ntips+1) { ## print("ERROR: reorder nodes the way ape/phylo expects! (tips first, then internals in order from left-to-right.") #yy[internal] <- rev(yy[!is.tip])[order(xx[!is.tip])] #xx[internal] <- rev(yy[!is.tip])[order(xx[!is.tip])] } edges <- matrix(nrow=0,ncol=2) edge.length <- numeric(0) nnode <- length(xx)-ntips while (sum(unlinked_nodes) > 1) { ## find next node to include (the rightmost internal node) nextnode <- which(!is.tip & xx==max(xx[!is.tip]))[1] ## find daughters # get the distance (in y) to all of the other corners ydist <- yy-yy[nextnode] xdist <- xx-xx[nextnode] # Check if it's the root if ( is.na(cy[nextnode]) ) { cy[nextnode] = yy[nextnode] cx[nextnode] = 0 # leftmost coordinate must be 0! } cydist <- yy-cy[nextnode] cxdist <- xx-cx[nextnode] # find the TWO tips closest to this internal node, which are RIGHT of this node # this only finds the CLOSEST tip in Y, we want the TWO closest tips!! #daughters <- which(is.tip & dist==min(dist[is.tip])) # rank the ydistances in the y direction rank_of_ydists = order(cydist) # rank the xdistances in the x direction rank_of_xdists = order(cxdist) # get the node numbers in order; delete from this list as they are eliminated nodes_to_keep = nodes # daughter nodes must be to the right (in x) of the nextnode # (and they must be unlinked) nodes_to_keep = nodes_to_keep[unlinked_nodes][xdist[unlinked_nodes] > 0] # daughter nodes should be the two closest corners to nextnode (in y, mostly AND x) absolute_dist_from_node = 100*abs(cydist[nodes_to_keep]) + 1*abs(cxdist[nodes_to_keep]) # sort the distances absolute_dist_from_node_sorted = sort(absolute_dist_from_node) # take the 2nd smallest absolute distance y_abs_dist_tokeep = absolute_dist_from_node_sorted[2] nodes_to_keep_final = nodes_to_keep[absolute_dist_from_node <= y_abs_dist_tokeep] print(paste("Linking: #", nodes_to_keep_final[1], " ", tip.labels[nodes_to_keep_final[1]], ", #", nodes_to_keep_final[2], " ", tip.labels[nodes_to_keep_final[2]], sep="")) #daughters <- which(is.tip & dist==min(dist[is.tip])) daughters = nodes_to_keep_final ## be careful with numeric fuzz? edges <- rbind(edges, nodes[c(nextnode,daughters[1])], nodes[c(nextnode,daughters[2])]) edge.length <- c(edge.length,xx[daughters]-xx[nextnode]) # add nextnode to the list of tips (which are not available nodes for the nextnode) is.tip[nextnode] <- TRUE # remove the daughters & coordinates from the list of available nodes unlinked_nodes[daughters] = FALSE print(sum(unlinked_nodes)) #xx <- xx[-daughters] #yy <- yy[-daughters] # remove the daughters from the list of possible nodes to link #unlinked_nodes #is.tip <- is.tip[-daughters] #nodes <- nodes[-daughters] } tr <- list(tip.labels=tip.labels, edge=edges, edge.length=edge.length, Nnode=nnode) class(tr) <- "phylo" tr <- reorder(tr) tr$tip.labels = tip.labels return(tr) } # This attempts to build the tree without corners. # This works OK for young nodes, but gets increasingly bad # with older nodes, unless you have a perfectly symmetrical tree. build_tree_without_using_corners <- function(xx, yy, tip.labels, poly=numeric(0), debug=FALSE) { # define the tips if (!missing(tip.labels)) { ntips <- length(tip.labels) } nodes <- 1:length(xx) is.tip <- nodes <= ntips # keep track of the nodes which are unlinked unlinked_nodes = rep(TRUE, length(nodes)) if (which.min(xx) != ntips+1) { ## print("ERROR: reorder nodes the way ape/phylo expects! (tips first, then internals in order from left-to-right.") #yy[internal] <- rev(yy[!is.tip])[order(xx[!is.tip])] #xx[internal] <- rev(yy[!is.tip])[order(xx[!is.tip])] } edges <- matrix(nrow=0,ncol=2) edge.length <- numeric(0) nnode <- length(xx)-ntips while (sum(unlinked_nodes) > 1) { ## find next node to include (the rightmost internal node) nextnode <- which(!is.tip & xx==max(xx[!is.tip]))[1] ## find daughters # get the distance (in y) to all of the other nodes ydist <- yy-yy[nextnode] xdist <- xx-xx[nextnode] # find the TWO tips closest to this internal node, which are RIGHT of this node # this only finds the CLOSEST tip in Y, we want the TWO closest tips!! #daughters <- which(is.tip & dist==min(dist[is.tip])) # rank the ydistances in the y direction rank_of_ydists = order(ydist) # rank the xdistances in the x direction rank_of_xdists = order(xdist) # get the node numbers in order; delete from this list as they are eliminated nodes_to_keep = nodes # daughter nodes must be to the right (in x) of the nextnode # (and they must be unlinked) nodes_to_keep = nodes_to_keep[unlinked_nodes][xdist[unlinked_nodes] > 0] # daughter nodes should be the two closest to nextnode (in y) absolute_dist_from_node = abs(ydist[nodes_to_keep]) # sort the distances absolute_dist_from_node_sorted = sort(absolute_dist_from_node) # take the 2nd smallest absolute distance y_abs_dist_tokeep = absolute_dist_from_node_sorted[2] nodes_to_keep_final = nodes_to_keep[absolute_dist_from_node <= y_abs_dist_tokeep] print(paste("Linking: #", nodes_to_keep_final[1], " ", tip.labels[nodes_to_keep_final[1]], ", #", nodes_to_keep_final[2], " ", tip.labels[nodes_to_keep_final[2]], sep="")) #daughters <- which(is.tip & dist==min(dist[is.tip])) daughters = nodes_to_keep_final ## be careful with numeric fuzz? edges <- rbind(edges, nodes[c(nextnode,daughters[1])], nodes[c(nextnode,daughters[2])]) edge.length <- c(edge.length,xx[daughters]-xx[nextnode]) # add nextnode to the list of tips (which are not available nodes for the nextnode) is.tip[nextnode] <- TRUE # remove the daughters & coordinates from the list of available nodes unlinked_nodes[daughters] = FALSE #xx <- xx[-daughters] #yy <- yy[-daughters] # remove the daughters from the list of possible nodes to link unlinked_nodes #is.tip <- is.tip[-daughters] #nodes <- nodes[-daughters] } zz <- list(tip.labels=tip.labels, edge=edges, edge.length=edge.length, Nnode=nnode) class(zz) <- "phylo" zz <- reorder(zz) zz$tip.labels = tip.labels return(zz) } treethief <- function(xypts) { ## from ?plot.tree: cat("(((Strix_aluco:4.2,Asio_otus:4.2):3.1,", "Athene_noctua:7.3):6.3,Tyto_alba:13.5);", file = "ex.tre", sep = "\n") tree.owls <- read.tree("ex.tre") #plot(tree.owls) unlink("ex.tre") # delete the file "ex.tre" #plot(tree.owls) xy <- get("last_plot.phylo",envir=.PlotPhyloEnv) xx <- xy$xx yy <- xy$yy points(xx,yy,col="white",pch=16,cex=2) text(xx,yy,col=2,1:length(xx)) newtree <- build.tree(xx,yy,tree.owls$tip.label) data(bird.orders) plot(bird.orders,show.node.label=TRUE) xy <- get("last_plot.phylo",envir=.PlotPhyloEnv) points(xx,yy,col="white",pch=16,cex=2) text(xx,yy,col=2,1:length(xx)) xx <- xy$xx yy <- xy$yy newtree2 <- build.tree(xx,yy,bird.orders$tip.label) } #################################### # charnames_to_nexus_text #################################### # Take a list of names, add numbers & quotes, to produce NEXUS character matrix input, e.g.: # # 1 'it_is1', 2 'it_is1_blank', 3 'it_is1a', 4 'it_is1b', 5 'it_is1c', 6 not1, 7 'not1_blank', 8 not1a, 9 not1b, 10 the1, 11 'the1_blank', 12 intstr1, 13 'intstr1_blank', 14 intstr1b, 15 intstr1c, 16 intstr2, 17 'intstr2_blank', 18 intstr2b, 19 intstr2c, 20 of1, 21 the2, 22 'species1_blank', 23 species1a, 24 species1b, 25 that1, 26 'that1_blank', 27 survive1, 28 'survive1_blank', 29 survive1a, 30 survive1b, 31 of1, 32 'it_is2', 33 not2, 34 'not2_blank', 35 the3, 36 species2, 37 species2a, 38 species2b, 39 that2, 40 'that2_blank', 41 that2a, 42 survives2a, 43 'survives2_blank', 44 but1, 45 'but1_blank', 46 but1a, 47 rather, 48 'rather_blank', 49 rathera, 50 'it_is3_blank', 51 'it_is3a', 52 'it_is3b', 53 'it_is3c', 54 the4, 55 'the4_blank', 56 species3, 57 'species3_blank', 58 species3a, 59 that3, 60 'that3_blank', 61 survives3, 62 is1, 63 'is1_blank', 64 the5, 65 'the5_blank', 66 one, 67 that4, 68 'that4_blank', 69 is2, 70 'able_best', 71 'able_best_blank', 72 'able_best_a', 73 'able_best_b', 74 to1, 75 adapt1, 76 'adapt1_blank', 77 adapt1a, 78 adapt1b, 79 to2, 80 'to2_blank', 81 and1, 82 'and1_blank', 83 to3, 84 adjust1, 85 'adjust1_blank', 86 adjust1a, 87 adjust1b, 88 best1, 89 'best1_blank', 90 to4, 91 the6, 92 'the6_blank', 93 changing, 94 'changing_blank', 95 'long_end', 96 'long_end_blank', 97 'long_enda', 98 'long_endb', 99 'long_endc', 100 'long_endd', 101 comma1, 102 comma2, 103 comma3, 104 comma4, 105 'intel_before_strong', 106 'General_Field', 107 paraphrase, 108 'Darwin_attribution', 109 'Darwin_subsource_simp', 110 'if_evo_theory', 111 'if_Darwins_theory', 112 'if_Origin', 113 'cited_source' ; charnames_to_nexus_text <- function(inarray, quotemark="") { tmpstr = c() for (i in 1 : (length(inarray)-1) ) { tmptmp = paste(i, " ", quotemark, inarray[i], quotemark, ", ", sep="") tmpstr = paste(tmpstr, tmptmp, sep="") } # add the last one, without comma tmpstr = paste(tmpstr, i, " ", quotemark, inarray[i+1], quotemark, " ", sep="") return(tmpstr) } chars_to_nexus_fmt <- function(charmatrix, namewidth=12) { outstr = "" width = ncol(charmatrix) for (i in 1:nrow(charmatrix)) { tmpstr1 = as.character(charmatrix$rownames[i]) len = nchar(tmpstr1, type="chars") numspaces_to_add = namewidth - len spaces_list = rep(" ", numspaces_to_add) tmpstr2 = array_to_text(c(tmpstr1, spaces_list), spacer="") chars = charmatrix[i,2:width] tmpstr3 = array_to_text(inarray=chars, spacer="") # check for "&" chars: convert 1&2 to (1 2) if (grepl("&", tmpstr3)) { for (j in 1:(width-1)) { if (grepl("&", chars[j])) { char = strsplit(as.character(chars[j]), "&") newchars_str = array_to_text(char[[1]], spacer=" ") newchar = paste("(", newchars_str, ")", sep="") chars[j] = newchar cat("Ambiguous character detected & converted...", charmatrix[i,1], ", char #", j, ": ", newchar, "\n") } } tmpstr3 = array_to_text(inarray=chars, spacer="") } tmpline = paste(" ", tmpstr2, tmpstr3, "\n", sep="") outstr = paste(outstr, tmpline, sep="") } outstr = trim(outstr) #outstr = paste(" ", outstr, sep="") return(outstr) } charmatrix_to_nexus_file <- function(charmatrix, nexus_outfn, quotemark="") { # initialize array of strings # l = lines l = c() l[1] = "#NEXUS" l[2] = "[ written by Nick Matzke R script 'quotescr_v11.R', written 9/1/2010 ]" l[3] = "" l[4] = "BEGIN TAXA;" l[5] = " TITLE Taxa;" l[6] = paste(" DIMENSIONS NTAX=", nrow(charmatrix), ";", sep="") l[7] = "TAXLABELS" tmpstr = array_to_text(charmatrix$rownames, spacer=" ") l[8] = paste(" ", tmpstr, sep="") l[9] = " ;" l[10] = "" l[11] = "END;" l[12] = "" l[13] = "" l[14] = "BEGIN CHARACTERS;" l[15] = " TITLE Character_Matrix;" l[16] = paste(" DIMENSIONS NCHAR=", (ncol(charmatrix)-1), ";", sep="") tmpstr = " FORMAT DATATYPE = STANDARD GAP = - MISSING = ?;" # maybe we can skip this? #SYMBOLS = 0 1 2 3 4 5 6 7 8 9 A B C D";" l[17] = tmpstr l[18] = " CHARSTATELABELS " tmpstr = charnames_to_nexus_text(inarray = names(charmatrix)[2:ncol(charmatrix)] , quotemark="") l[19] = paste(" ", tmpstr, ";", sep="") l[20] = " MATRIX" # assemble all the characters into one big string l[21] = chars_to_nexus_fmt(charmatrix, namewidth=11) l[22] = ";" l[23] = "" l[24] = "END;" write.table(l, nexus_outfn, append=FALSE, quote=FALSE, row.names=FALSE, col.names=FALSE) headf(nexus_outfn) tailf(nexus_outfn) cat("Character matrix output to NEXUS: '", nexus_outfn, "'\n") } charnames_to_tnt_text <- function(inarray, quotemark="") { tmpstr = c() for (i in 1:length(inarray) ) { tmptmp = paste("{", i-1, " ", quotemark, inarray[i], quotemark, ";\n", sep="") tmpstr = paste(tmpstr, tmptmp, sep="") } return(tmpstr) } chars_to_tnt_fmt <- function(charmatrix, namewidth=12) { outstr = "" width = ncol(charmatrix) for (i in 1:nrow(charmatrix)) { tmpstr1 = as.character(charmatrix$rownames[i]) #len = nchar(tmpstr1, type="chars") #numspaces_to_add = namewidth - len #spaces_list = rep(" ", numspaces_to_add) tmpstr2 = array_to_text(c(tmpstr1, " "), spacer="") chars = charmatrix[i,2:width] tmpstr3 = array_to_text(inarray=chars, spacer="") # check for "&" chars: convert 1&2 to (1 2) if (grepl("&", tmpstr3)) { for (j in 1:(width-1)) { if (grepl("&", chars[j])) { char = strsplit(as.character(chars[j]), "&") newchars_str = array_to_text(char[[1]], spacer=" ") newchar = paste("[", newchars_str, "]", sep="") chars[j] = newchar cat("Ambiguous character detected & converted...", charmatrix[i,1], ", char #", j, ": ", newchar, "\n") } } tmpstr3 = array_to_text(inarray=chars, spacer="") } tmpline = paste("", tmpstr2, tmpstr3, "\n", sep="") outstr = paste(outstr, tmpline, sep="") } outstr = trim(outstr) #outstr = paste("", outstr, sep="") return(outstr) } charmatrix_to_TNT_file <- function(charmatrix, tnt_outfn, quotemark="") { # initialize array of strings # l = lines l = c() l[1] = "xread" l[2] = paste(ncol(charmatrix)-1, nrow(charmatrix)) tmpstr = chars_to_tnt_fmt(charmatrix) l[3] = tmpstr l[4] = ";" l[5] = "" l[6] = "cnames" l[7] = charnames_to_tnt_text( names(charmatrix)[2:ncol(charmatrix)] ) l[8] = ";" l[9] = "" l[10] = "" l[11] = "proc /;" l[12] = "comments 0" l[13] = ";" l[14] = "" l[15] = "" write.table(l, tnt_outfn, append=FALSE, quote=FALSE, row.names=FALSE, col.names=FALSE) headf(tnt_outfn) tailf(tnt_outfn) cat("Character matrix output to TNT: '", tnt_outfn, "'\n") } scale_2values_axis <- function(observed_nums, observed_axis_inbreaks, desired_axis) { xlims_v1_min = min(observed_nums) xlims_v1_max = max(observed_nums) xlims_min_inbreaks = min(observed_axis_inbreaks) xlims_max_inbreaks = max(observed_axis_inbreaks) xlims_min_desired = min(desired_axis) xlims_max_desired = max(desired_axis) x = c(xlims_v1_min, xlims_v1_max) y = c(xlims_min_inbreaks, xlims_max_inbreaks) reg_result = lm(y~x) slope = reg_result$coefficients[2] intercept = reg_result$coefficients[1] xlims_touse_min = slope * xlims_min_desired + intercept xlims_touse_max = slope * xlims_max_desired + intercept xlims_touse = c(xlims_touse_min, xlims_touse_max) return(xlims_touse) } make_multihist <- function(l, desired_xlims, colors, numbreaks, tmptitle, xlabel) { for (i in 1:length(l)) { if (i==1) { hist(l[[i]], breaks=numbreaks, col=colors[i], border="white", xlim=desired_axis, main=tmptitle, xlab=xlabel, add=FALSE) } else { hist(l[[i]], breaks=numbreaks, col=colors[i], border="white", xlim=desired_axis, add=TRUE) } #col=colors, border=c("white"), xlim=xlims_touse, plot.it=FALSE) #plot(tmphist, col=colors[i], border="white", xlim=desired_axis) #tmphist = hist(l[[i]], breaks=numbreaks, plot=FALSE) #plot(tmphist, col=colors[i], border="white", add=TRUE) } #border=c("white"), #offsets = min(tmphist$breaks) #barwidth = tmphist$breaks[1]-tmphist$breaks[0] #xrange = c(0, max(tmphist$breaks)) #barplot(tmphist$out, offset=offsets), width=barwidth, xlim=c(min(xrange), max(xrange)), ylim=c(0, 1.2*max(tmphist$out))) #, beside=TRUE, col=colors, xlim=desired_axis, ylim=c(0, 1.2*max(tmphist$out)), bty="o", xaxt="n") #axis(1, at=pretty(), label=pretty(desired_axis), tick=TRUE, tcl=ticklength, pos=0) } # Limit each numeric column in a dataframe to e.g. 4 significant digits signif_digits_df <- function(dtf, numsig=4, printout=FALSE) { dtf_classes = cls.df(dtf, printout) for (i in 1:length(dtf_classes$cls_col_list)) { cls_col = dtf_classes$cls_col_list[i] if (cls_col == "numeric") { # Convert numeric column to one with signif digits colname = dtf_classes$dtf_names[i] cmdstr = paste("dtf$", colname, " = signif(dtf$", colname, ", digits=", numsig, ")", sep="") eval(parse(text = cmdstr)) } } return(dtf) } read_table_good <- function(fn) { # Read in the data, store in variable d # This has all of Nick's preferred options dtf = read.table(fn, header=TRUE, sep=" ", quote="", stringsAsFactors = FALSE, strip.white=TRUE, fill=TRUE) return(dtf) } write_table_good <- function(dtf, outfn) { write.table(dtf, file=outfn, append=FALSE, quote=FALSE, sep=" ", row.names=FALSE, col.names=TRUE) } write_line_good <- function(dtf, outfn) { write.table(dtf, file=outfn, append=TRUE, quote=FALSE, sep=" ", row.names=FALSE, col.names=TRUE) } write_table_good_w_rownames <- function(dtf, outfn) { write.table(dtf, file=outfn, append=FALSE, quote=FALSE, sep=" ", row.names=TRUE, col.names=TRUE) } is.not.NA <- function(x) { return(is.na(x) == FALSE) } is.not.na <- function(x) { return(is.na(x) == FALSE) } # remove NA rows from list na.remove <- function(x) { outx = x[is.not.NA(x)] return(outx) } allneg <- function(inlist) { x = (inlist <= 0) sumneg = sum(x, na.rm=TRUE) lenneg = length(na.remove(x)) if (lenneg == sumneg) { return(TRUE) } else { return(FALSE) } } allpos <- function(inlist) { x = (inlist >= 0) sumpos = sum(x, na.rm=TRUE) lenpos = length(na.remove(x)) if (lenpos == sumpos) { return(TRUE) } else { return(FALSE) } } remove_NA_rows <- function(dtf, colname) { cat("\n") cat("remove_NA_rows: initial #rows = ", nrow(dtf), "\n") cmdstr = paste("rows_to_keep = is.not.NA(dtf$", colname, ")", sep="") #print(cmdstr) eval(parse(text = cmdstr)) #print(dim(df)) #print(length(rows_to_keep)) #print(rows_to_keep) outdf = dtf[rows_to_keep, ] cat("remove_NA_rows: ending #rows = ", nrow(outdf), "\n") return(outdf) } # check if all elements match do_all_match <- function(arr1, arr2) { matches = (arr1 == arr2) if (sum(matches) == length(matches)) { return(TRUE) } else { return(FALSE) } } # split strings on whitespace strsplit_whitespace <- function(tmpline) { # split on 1 or more whitespaces temp = strsplit(tmpline, "[ \t]+") # get the list list_of_strs = temp[[1]] # remove any leading/trailing "" list_of_strs = list_of_strs[list_of_strs != ""] return(list_of_strs) } # split strings on whitespace strsplit_on_tabs_remove_whitespace <- function(tmpline) { # split on 1 or more whitespaces temp = strsplit(tmpline, "[\t]") # get the list list_of_strs = temp[[1]] # remove any leading/trailing "" list_of_strs = list_of_strs[list_of_strs != ""] for (i in 1:length(list_of_strs)) { tmpstr = list_of_strs[i] list_of_strs[i] = gsub(" ", "", tmpstr) } return(list_of_strs) } # catch "integer(0)" e.g. from non-matching grep is.numzero <- function(x) { if (length(x) == 0) { return(TRUE) } else { return(FALSE) } } is.inf <- function(x) { return(x == Inf) } is.not.inf <- function(x) { return(is.inf(x) == FALSE) } return_items_not_NA <- function(x) { y = x[is.not.NA(x)] return(y) } plot_point <- function(x, y, pt_title, color) { points(x, y, type="p", pch=19, cex=2, col=color) text(x, y, pt_title, col=color, pos=4, bg="white") } # print out the last 5 rows of a data frame foot <- function(dtf) { nrows = nrow(dtf) print(dtf[((nrows-5) : nrows), ]) } # print out the header, footer, and basic info on a dataframe (df) sumdf <- function(dtf) { print(dim(dtf)) print(head(dtf)) foot(dtf) # print the class of each column for (col in 1:ncol(dtf)) { cat("Col#", col, " class = ", class(dtf[1,col]), "\n", sep="") } } make_cdf <- function(list_rel_probs) { if (sum(list_rel_probs) == 0) { print("make_cdf() ERROR: sum(list_rel_probs) is ZERO") print(list_rel_probs) return(NA) } if (class(list_rel_probs) != "data.frame") { dim(list_rel_probs) = c(length(list_rel_probs), 1) list_rel_probs = data.frame(list_rel_probs) } cdf_rel_probs = rep(0, nrow(list_rel_probs)) dim(cdf_rel_probs) = c(length(cdf_rel_probs), 1) cdf_rel_probs = data.frame(cdf_rel_probs) cdf_rel_probs = cbind(cdf_rel_probs, cdf_rel_probs) names(cdf_rel_probs) = c("bottom", "top") for (i in 1:nrow(list_rel_probs)) { if (i==1) { cdf_rel_probs$bottom[i] = 0 cdf_rel_probs$top[i] = list_rel_probs[i, 1] } else { cdf_rel_probs$bottom[i] = cdf_rel_probs$top[i-1] cdf_rel_probs$top[i] = cdf_rel_probs$top[i-1] + list_rel_probs[i, 1] } } cdf_rel_probs_new = cdf_rel_probs / sum(list_rel_probs) return(cdf_rel_probs_new) } # empirical p-values (for empirical PDF, cdf, percentiles, quantiles) empirical_pval_fast <- function(x, observed_val) { num_lt_observed = sum(x <= observed_val) pval = num_lt_observed / length(x) return(pval) } # empirical p-values (for empirical PDF, cdf, percentiles, quantiles) empirical_pval_slow <- function(x, observed_val) { freqtab = f.freq(x) pval = freqtab$cumul_fraction[freqtab$x == observed_val] return(pval) } f.freq <- function(x) { freqtab <- data.frame(table(x)) freqtab$fraction <- freqtab$Freq / length(x) freqtab$cumul_fraction[1] <- freqtab$fraction[1] for(i in 2:length(freqtab[,1])) { freqtab$cumul_fraction[i] = freqtab$cumul_fraction[i-1] + freqtab$fraction[i] } return(freqtab) } # Convert unique items to characters unique_items_to_chars <- function(tempy, y, namestr) { col_w_name = match(namestr, names(tempy), nomatch=NaN) tmpname = names(y)[col_w_name] cmdstr = paste("charstates = unique(tempy$", tmpname, ")", sep="") eval(parse(text = cmdstr)) charstates = charstates[charstates != "?"] charindexes = seq(1, length(charstates)) cat("\n") cat(namestr, " - ", "# of states: ", length(charstates), "\n", sep="") cat("code state\n") # Display the characters for (i in 1:length(charstates)) { #y$General_Field[tempy$General_Field == charstates[i]] = mesquite_character_states[charindexes[i]] cat(mesquite_character_states[charindexes[i]], ": ", charstates[i], "\n", sep="") cmdstr = paste("y$", tmpname, "[tempy$", tmpname, " == charstates[i]] = mesquite_character_states[charindexes[i]]", sep="") eval(parse(text = cmdstr)) } return(y) } # Retrieve chars # Note: only works if there is a 1-to-1 mapping of codes to text # # inchars_outchars = retrieve_char_text(tempy, y, namestr) # inchars = inchars_outchars[[1]] # outchars = inchars_outchars[[2]] retrieve_char_text <- function(tempy, y, namestr) { tmpname = namestr # Get numerical characters cmdstr = paste("charstates = unique(tempy$", tmpname, ")", sep="") eval(parse(text = cmdstr)) # Get text states cmdstr = paste("textstates = unique(y$", tmpname, ")", sep="") eval(parse(text = cmdstr)) inchars = c() outchars = c() for (i in 1:length(charstates)) { charstate = charstates[i] charstate_index = match(charstate, charstates) textstate = textstates[charstate_index] inchars = c(inchars, charstate) outchars = c(outchars, textstate) } inchars_outchars = list(inchars, outchars) return(inchars_outchars) } print_chars <- function(y, namestr) { col_w_name = match(namestr, names(y), nomatch=NaN) tmpname = names(y)[col_w_name] cmdstr = paste("charstates = unique(y$", tmpname, ")", sep="") eval(parse(text = cmdstr)) cat("\n") cat(namestr, " - ", "# of states: ", length(charstates), "\n", sep="") cat("List of the character states:\n") # Display the characters for (i in 1:length(charstates)) { cat(charstates[i], "\n", sep="") } } print_char_key <- function(inchars, outchars) { cat("\n") cat("Printing codes for characters:\n") for (i in 1:length(inchars)) { cat(outchars[i], ": ", inchars[i], "\n", sep="") } } print_chars_catlist <- function(y, namestr) { mesquite_character_states = c("0", "1", "2", "3", "4", "5", "6", "7", "8", "9", "A", "B", "C", "D", "E", "F", "G", "H", "K", "M", "N", "P", "Q", "R", "S", "T", "U", "V", "W", "X", "Y", "Z", "a", "b", "c", "d", "e", "f", "g", "h", "l", "m", "n", "p", "q", "r", "s", "t", "u", "v", "w", "x", "y", "z") col_w_name = match(namestr, names(y), nomatch=NaN) tmpname = names(y)[col_w_name] cmdstr = paste("charstates = unique(y$", tmpname, ")", sep="") eval(parse(text = cmdstr)) cat("\n") cat(namestr, " - ", "# of states: ", length(charstates), "\n", sep="") cat("\n") cat("List of the character states:\n") # Display the characters -- text input cat("inchars = c(") for (i in 1:(length(charstates)-1)) { cat('"', charstates[i], '", \n', sep="") } cat('"', charstates[length(charstates)], '")\n', sep="") cat("\n") # Display the characters -- suggested coded output cat("outchars = c(") charcount = 0 for (i in 1:(length(charstates)-1)) { if (charstates[i] != "?") { charcount = charcount + 1 cat('"', mesquite_character_states[charcount], '", \n', sep="") } else { cat('"?"', ', \n', sep="") } } if (charstates[length(charstates)] != "?") { charcount = charcount + 1 cat('"', mesquite_character_states[charcount], '")\n', sep="") } else { cat('"?"', ')\n', sep="") } cat("\n") } convert_items_to_items <- function(y, namestr, name1list, name2list) { for (i in 1:length(name1list)) { name1 = name1list[i] name2 = name2list[i] y = convert_item_to_item(y, namestr, name1, name2) } return(y) } convert_item_to_item <- function(y, namestr, name1, name2) { col_w_name = match(namestr, names(y), nomatch=NaN) tmpname = names(y)[col_w_name] cmdstr = paste("y$", tmpname, "[y$", tmpname, ' == "', name1, '"] = "', name2, '"', sep="") print(cmdstr) eval(parse(text = cmdstr)) return(y) } convert_grepl_item_to_item <- function(y, namestr, name1, name2) { col_w_name = match(namestr, names(y), nomatch=NaN) tmpname = names(y)[col_w_name] cmdstr = paste("rows_to_change = grepl(name1, y$", tmpname, ")", sep="") eval(parse(text = cmdstr)) cmdstr = paste("y$", tmpname, "[rows_to_change] = '", name2, "'", sep="") print(cmdstr) eval(parse(text = cmdstr)) return(y) } # Extract rows with indices matching rows_to_keep gather_rows_with_indices <- function(tmptable, rows_to_keep) { # make empty matrix newtable = matrix(data=NA, nrow = length(rows_to_keep), ncol=ncol(tmptable)) names(newtable) = names(tmptable) for (i in 1:nrow(newtable)) { # insert the new row newtable = tmptable[rows_to_keep[i], ] } return(newtable) } # remove rows from temp_table, based on col matching list_to_remove remove_rows_with_list_to_remove <- function(temp_table, col, list_to_remove) { # get TRUE/FALSE for which rows in species col are undefined, according to list_to_remove truefalse_rows_match = match_list1_in_list2(col, list_to_remove) cat("remove_rows_with_list_to_remove(): initial table #rows = ", nrow(temp_table), "\n") cat("Removing ", (sum(truefalse_rows_match)), " rows with c(", list2str(c(list_to_remove)), ") in them.\n", sep="") temp_table = temp_table[truefalse_rows_match == FALSE, ] cat("remove_rows_with_list_to_remove(): revised table #rows = ", nrow(temp_table), "\n") return(temp_table) } list2str <- function(list1, spacer=" ") { for (i in 1:length(list1)) { if (i == 1) { tmpstr = as.character(list1[1]) if (length(list1) == 1) { return(tmpstr) } next } addstr = as.character(list1[i]) tmpstr = paste(tmpstr, addstr, sep=spacer) } return(tmpstr) } list2str_unlist <- function(list1, spacer=" ") { outlist = c("startlist") for (i in 1:length(list1)) { listitem = list1[i] if (length(listitem) > 1) { outlist = append(outlist, list2str(unlist(listitem), spacer=" ")) } else { addstr = as.character(list1[i]) #tmpstr = paste(tmpstr, addstr, sep=spacer) outlist = append(outlist, addstr) } } tmpstr = list2str(outlist, spacer=" ") return(tmpstr) } return_unlist <- function(list1, spacer=" ") { outlist = c("startlist") for (i in 1:length(list1)) { listitem = list1[i] if (length(listitem) > 1) { outlist = append(outlist, unlist(listitem), spacer=" ") } else { addstr = as.character(list1[i]) #tmpstr = paste(tmpstr, addstr, sep=spacer) outlist = append(outlist, addstr) } } return(outlist) } unlist_dtf_cols <- function(dtf, printflag=FALSE) { # Sometimes cbind makes each column a list, this can screw up use/searching of # the column later on. # Unlist each column... for (i in 1:ncol(dtf)) { tmpstr = paste("unlisting col: ", names(dtf)[i], "...", sep="") prflag(tmpstr, printflag=printflag) # catch a possible error from unlisting # the number of rows needs to stay the same!! tmpcol = unlist(dtf[, i]) if (length(tmpcol) != length(dtf[, i])) { tmpstr = paste("...failed! unlist(col) length=", length(tmpcol), "; nrow(dtf) = ", nrow(dtf), sep="") prflag(tmpstr, printflag=printflag) } else { dtf[, i] = tmpcol tmpstr = paste(" ", " ", sep="") prflag(tmpstr, printflag=printflag) } } return(dtf) } char_descriptions_to_NEXUS <- function(namestr_list, inchars_list, outchars_list) { # Create l, an empty list of lines... l = list() #l[(h=h+1)] = " CHARSTATELABELS" tmpcharstr_list = c() for (i in 1:length(namestr_list)) { tmpname = namestr_list[i] # filter out ? and '' inchars = inchars_list[[i]] outchars = outchars_list[[i]] chars_to_keep1 = outchars != "" chars_to_keep2 = outchars != "?" # remove the character states which are "" or "?" in the coded section chars_to_keep = ((chars_to_keep2 + chars_to_keep1) == 2) inchars2 = inchars[chars_to_keep] oldstr = "'" newstr = "" tmp_charstates_list_noapost = gsub(oldstr, newstr, inchars2) oldstr = " " newstr = "_" tmp_charstates_list_nospace = gsub(oldstr, newstr, tmp_charstates_list_noapost) #tmp_charstates_str = list2str(inchars_list[[i]], spacer='" "') tmp_charstates_str = list2str(tmp_charstates_list_nospace, spacer="' '") tmp_charstates_str2 = paste("'", tmp_charstates_str, "'", sep="") tmpcharstr = paste(i, tmpname, "/ ", tmp_charstates_str2) tmpcharstr_list = c(tmpcharstr_list, tmpcharstr) } bigstr_chardescs = list2str(tmpcharstr_list, spacer=", ") charstatelabels = paste(" ", bigstr_chardescs, " ;", sep="") return(charstatelabels) } # Add character state labels to a preexisting NEXUS file... add_charstatelabels_to_NEXUS <- function(charstatelabels, infn, outfn) { lines = scan(file=infn, what="character", sep="\n") # Go through lines line_with_charstatelabels = NULL for (i in 1:length(lines)) { # Get line line = lines[i] # Check if line matches CHARSTATELABELS if (grepl("CHARSTATELABELS", line)) { line_with_charstatelabels = i } else { blah=TRUE } } # Assuming you found line_with_charstatelabels... lines[line_with_charstatelabels+1] = charstatelabels write.table(lines, file=outfn, quote=FALSE, append=FALSE, sep="", row.names = FALSE, col.names=FALSE) return(lines) } count_rows_NOT_containing_substr_in_col <- function(dtf, col, tmp_substr) { list_to_kill = grepl(tmp_substr, col, fixed=TRUE) list_to_keep = (list_to_kill == FALSE) print(sum(list_to_keep)) return(sum(list_to_keep)) } count_rows_containing_substr_in_col <- function(dtf, col, tmp_substr) { list_to_keep = grepl(tmp_substr, col, fixed=TRUE) print(sum(list_to_keep)) return(sum(list_to_keep)) } remove_rows_containing_substr_in_col <- function(dtf, col, tmp_substr) { list_to_kill = grepl(tmp_substr, col, fixed=TRUE) list_to_keep = (list_to_kill == FALSE) return(dtf[list_to_keep, ]) } retain_rows_containing_substr_in_col <- function(dtf, col, tmp_substr) { list_to_keep = grepl(tmp_substr, col, fixed=TRUE) return(dtf[list_to_keep, ]) } extract_rows_with_col_matching_findthis_str <- function(temp_table, col, findthis) { # Extract North-America-specific modern extinctions matches = find_str_inlist(findthis, tmplist=col) cat("# of rows matching '", findthis, "': ", sum(matches), "\n", sep="") temp_table = temp_table[matches, ] return(temp_table) } # Find a string inside another string... find_instring <- function(findthis, string) { result = grep(findthis, string) if (length(result) > 0) { #print("match") match = TRUE } else { match = FALSE } return(match) } find_str_inlist <- function(findthis, tmplist) { # make empty list outlist = rep(NA, length(tmplist)) for (i in 1:length(tmplist)) { string = tmplist[i] outlist[i] = find_instring(findthis, string) } return(outlist) } # return matching TRUE/FALSE values # list1 (.e.g. a big list) TRUE if it is found in list2 (e.g. a smaller list) match_list1_in_list2 <- function(list1, list2) { matchlist = list1 %in% list2 return(matchlist) } compare_2cols_against_another_2cols_find_matching_pairs_in_list2 <- function(list1, list2) { list1_paste = paste(list1[,1], ",", list1[,2], sep="") list2_pasteA = paste(list2[,1], ",", list2[,2], sep="") list2_pasteB = paste(list2[,1], ",", list2[,2], sep="") list2_matches1 = match_list1_in_list2(list2_pasteA, list1_paste) list2_matches2 = match_list1_in_list2(list2_pasteB, list1_paste) list2_matches = list2_matches1 + list2_matches2 > 0 return(list2_matches) } # NOTE!!! THESE MATCH FUNCTIONS JUST RETURN THE *FIRST* MATCH, *NOT* ALL MATCHES # (argh) # return indices in 2nd list matching the first list # It WILL return one match for each item in the list, though... get_indices_where_list1_occurs_in_list2 <- function(list1, list2) { match_indices = match(list1, list2) return(match_indices) } get_indices_where_string_in_list1_occurs_in_list2 <- function(list1, list2) { match_indices = unlist(lapply(list1, get_index_in_list2_matching_string, list2)) match_indices[is.na(match_indices) == TRUE] = "" return(match_indices) } # returns the index of the FIRST hit get_index_in_list2_matching_string <- function(tmpstr, list2) { matches_TF = lapply(tmpstr, grepl, list2)[[1]] first_hit_index = match(TRUE, matches_TF) return(first_hit_index) } # return indices in 2nd list matching the first list get_indices_where_list1_occurs_in_list2_noNA <- function(list1, list2) { match_indices = match(list1, list2) match_indices = return_items_not_NA(match_indices) return(match_indices) } # this is f-ed up!! get_real_indices_where_list1_occurs_in_list2 <- function(list1, list2) { print("get_real_indices_where_list1_occurs_in_list2()") print("WARNING: THIS FUNCTION RETURNS THE INDICES OF NOT NA, NOT THE INDICES OF THE MATCHES") indices = 1:length(list1) matches_or_NAs = get_indices_where_list1_occurs_in_list2(list1, list2) not_NA = is.not.NA(matches_or_NAs) indices_to_return = indices[not_NA] return(indices_to_return) } # place items with string matching list2 in new column of size list2 #place_items_matching_str_in_list1_in_newcol <- function() # colmax, max.col colmax <- function(dtf) { maxvals = apply(dtf, 2, max, na.rm=TRUE) return(maxvals) } colmin <- function(dtf) { minvals = apply(dtf, 2, min, na.rm=TRUE) return(minvals) } normalize_by_colmax <- function(dtf) { maxvals = colmax(dtf) # this outputs the results into columns, instead of corresponding rows x = apply(dtf, 1, "/", maxvals) # so transpose normalized_df = t(x) normalized_df = as.data.frame(normalized_df) names(normalized_df) = names(dtf) return(normalized_df) } ############################################## # Matrix functions ############################################## # symmetric flip across the diagonal of a matrix flipdiag <- function(mat) { newmat = mat for (i in ncol(mat)) { for (j in nrow(mat)) { newmat[j,i] = mat[i,j] } } return(newmat) } nondiagTF <- function(m) { # Returns TF for non-diagonals diag(m) = "blah" TF = m != "blah" return(TF) } nondiag <- function(m) { # Returns non-diagonals nondiag_m = m[nondiagTF(m)] return(nondiag_m) } # Compare lower and upper diagonals compare_lower_upper_diag <- function(x) { xtri = x[lower.tri(x)] ytmp = flipdiag(x) ytri = ytmp[lower.tri(ytmp)] (cor(cbind(xtri, ytri))) dif = c(mean(xtri))-c(mean(ytri)) prstr = paste(mean(c(xtri)), mean(c(ytri)), dif, sep=" ") print(prstr) plot(xtri, ytri, type="p", pch=".") return(dif) } # Compare row/col compare_row_col <- function(x, rownum) { xnums = c(x[rownum, ]) ynums = c(x[, rownum]) (cor(cbind(xnums, ynums))) dif = c(mean(xnums))-c(mean(ynums)) prstr = paste(mean(c(xnums)), mean(c(ynums)), dif, sep=" ") print(prstr) plot(xnums, ynums, type="p", pch=".") return(dif) } # Find rows contains # (note that this findrows_containing <- function(x, thing_to_find) { rows_to_keep = 1:nrow(x) for (i in 1:nrow(x)) { if(sum(x[i, ] == thing_to_find, na.rm = TRUE) > 0) { rows_to_keep[i] = TRUE } else { rows_to_keep[i] = FALSE } } return(rows_to_keep) } # Find rows containing no NAs findrows_containing_noNA <- function(x) { rows_to_keep = is.na(rowSums(x)) return(rows_to_keep == FALSE) } # Find rows containing no NAs findrows_containing_noINF <- function(x) { rows_to_keep = is.inf(rowSums(x)) return(rows_to_keep == FALSE) } # Find rows containing no NAs, no Infs return_rows_containing_noNA_noInf <- function(x) { rows_to_keep = 1:nrow(x) rows_to_keep = findrows_containing_noNA(x) y = x[rows_to_keep, ] rows_to_keep = findrows_containing_noINF(y) z = y[rows_to_keep, ] return(z) } ############################################## # Plotting functions ############################################## # Make some subplots subplots <- function(nrows=1, ncols=2) { q = quartz(width=7*ncols, height=7*nrows) par(mfcol = c(nrows, ncols)) return(q) } # Pairs plot with histograms panel.hist <- function(x, ...) { usr <- par("usr"); on.exit(par(usr)) par(usr = c(usr[1:2], 0, 1.5) ) h <- hist(x, plot = FALSE) breaks <- h$breaks; nB <- length(breaks) y <- h$counts; y <- y/max(y) rect(breaks[-nB], 0, breaks[-1], y, col="cyan", ...) } pairs_with_hist <- function(datamat, ...) { pairs(datamat, diag.panel=panel.hist, ...) return() } # ========================================================= # Google-related functions # ========================================================= # Copy the corrected google numbers to each matching quote # (one could also split the results amongst the matching quotes; see next function) copy_fixed_quantchars_to_each_matching <- function(quantchars, quotes_list_no_extra_spaces_to_google, fixed_goog, cols_in_fixed_goog, unique_quotes_to_google, divide_by_frequency = TRUE) { quantchar_copy_nums_to_repeated_quotes = NULL q = NULL # the full list of quotes (from whatever source; full quantchars dataset, or subset) rownames = quantchars$rownames quotetxt = quotes_list_no_extra_spaces_to_google numcols = ncol(quantchars) - 1 tmprow = rep(NA, numcols) # store the accumulating rows here tmpmat = NULL names_cols_in_fixed_goog = names(fixed_goog)[cols_in_fixed_goog] # default (no spreading out of Google Hits across multiple hits of quote) divide_by_factor = 1 for (i in 1:nrow(quantchars)) { tmp_quotetxt_in_bigmatrix_list = c(quotetxt[i]) tmp_quotetxt_in_bigmatrix = quotetxt[i] matching_unique_quotetxt_row = get_indices_where_list1_occurs_in_list2(tmp_quotetxt_in_bigmatrix_list, unique_quotes_to_google) #print(matching_unique_quotetxt_row) # get number of other guys showing that match in the big list num_in_biglist_matching = sum(quotes_list_no_extra_spaces_to_google == tmp_quotetxt_in_bigmatrix) print(num_in_biglist_matching) # if you do want to change the numbers by dividing equally amongst multiple hits of the quote, # do so here... if (divide_by_frequency == TRUE) { divide_by_factor = num_in_biglist_matching } else { pass = "blah" } tmprow = as.numeric(fixed_goog[matching_unique_quotetxt_row, cols_in_fixed_goog]) / divide_by_factor numchars = nchar(tmp_quotetxt_in_bigmatrix) numwords = length(strsplit(tmp_quotetxt_in_bigmatrix, " ")[[1]]) tmprow2 = c(as.data.frame(numchars), as.data.frame(numwords), tmprow) tmpmat = rbind(tmpmat, tmprow2) } tmpmat2 = cbind(as.data.frame(rownames), as.data.frame(quotetxt), as.data.frame(tmpmat, row.names=1:nrow(tmpmat))) outdf = as.data.frame(tmpmat2, row.names=1:nrow(tmpmat2)) names(outdf) = c("rownames", "quotetxt", "numchars", "numwords", names_cols_in_fixed_goog) quantchar_copy_nums_to_repeated_quotes = outdf return(quantchar_copy_nums_to_repeated_quotes) } # If a list has [[1]], [[2]], or is a dataframe, kill that with this get_items_from_nested_list <- function(nested_list) { tmpout = c() for (i in 1:length(nested_list)) { item = nested_list[[i]] tmpout = c(tmpout, item) } return(tmpout) } # If a list has [[1]], [[2]], or is a dataframe, kill that with this get_items_from_nonnested_list <- function(nested_list) { tmpout = c() for (i in 1:length(nested_list)) { item = nested_list[i] tmpout = c(tmpout, item) } return(tmpout) }