##############################################################################
# Delta test, version 0.4.8 (beta release)
#
# Copyright (C) 2009-2013 by Maciej Eder, Jan Rybicki & Mike Kestemont.
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
# ############################################################################
#
# To cite this script in publications, you might use:
# Eder, M. and Rybicki, J. (2011). Stylometry with R. In "Digital Humanities
# 2011: Conference Abstracts." Stanford University, Stanford, CA, pp. 308-11.
#
# Contact with the authors:
#   Maciej Eder
#   Jan Rybicki
#   Mike Kestemont
# ############################################################################
#
# ver. 0.4.8,   2012/12/29 --> customizable plot area, font size, etc.;
#                              thoroughly rewritten code for margin assignment;
#                              scatterplots represented either by points, or
#                              by labels, or by both (customizable label
#                              offset); saving the words (features) actually
#                              used, saving the table of actually used
#                              frequencies
# ver. 0.4.7,   2012/11/25 --> new output/input extensions: optional custom
#                              list of files to be analyzed, saving distance
#                              table(s) to external files, support for TXM
#                              Textometrie Project; color cluster analysis
#                              graphs (at last!)
# ver. 0.4.6,   2012/09/09 --> code revised, cleaned, bugs fixed
# ver. 0.4.5.4, 2012/09/03 --> added 2 new PCA visualization flavors
# ver. 0.4.5.3, 2012/08/31 --> new GUI written
# ver. 0.4.5.2, 2012/08/27 --> added functionality for normal sampling
# ver. 0.4.5.1, 2012/08/22 --> support for Dutch added
# ver. 0.4.5,   2012/07/07 --> option for choosing corpus files; code cleaned;
#                              bugs fixed
# ver. 0.4.4,   2012/05/31 --> the core code rewritten, the I/II set division
#                              abandoned, GUI remodeled, tooltips added,
#                              different input formats supported (xml etc.),
#                              config options loaded from external file;
#                              the code forked into (1) this script, supporting
#                              explanatory analyses (MDS, Cons. Trees, ...),
#                              (2) a script for machine-learning methods
# ver. 0.4.3,   2012/04/28 --> feature selection (word and character n-grams)
# ver. 0.4.2,   2012/02/10 --> three ways of splitting words in English;
#                              bugs fixed; GUI code rearranged and simplified
# ver. 0.4.1,   2011/06/27 --> better output; better text files uploading,
#                              new options for culling and ranking of
#                              candidates
# ver. 0.4.0,   2011/06/20 --> the official world-premiere (Stanford, CA)
# ver. 0.3.9b,  2011/06/01 --> the code simplified; minor cleaning
# ver. 0.3.9,   2011/05/21 --> uploading wordlist from external source;
#                              thousands of improvements; the code simplified
# ver. 0.3.8,   2010/11/01 --> skip top frequency words option added
# ver. 0.3.7,   2010/11/01 --> better graphs; attempt at better graph layout
# ver. 0.3.6,   2010/07/31 --> more graphic options; dozens of improvements
# ver. 0.3.5,   2010/07/19 --> module for color graphs; module for PCA
# ver. 0.3.4,   2010/07/12 --> module for uploading corpus files improved
# ver. 0.3.3,   2010/06/03 --> the core code simplified and improved (faster!)
# ver. 0.3.2,   2010/05/10 --> reordered GUI; minor cleaning
# ver. 0.3.1,   2010/05/10 --> the z-scores module improved
# ver. 0.3.0,   2009/12/26 --> better counter of "good guesses"; option for
#                              randomly generated samples; minor improvements
# ver. 0.2.99,  2009/12/25 --> platform-independent outputfile saving
# ver. 0.2.98,  2009/12/24 --> GUI thoroughly integrated with initial variables
# ver. 0.2.10,  2009/11/28 --> corrected MFW display in graph, more analysis
#                              description in outputfile
# ver. 0.2.9,   2009/11/22 --> auto graphs for MDS and CA
# ver. 0.2.8a,  2009/11/21 --> remodeled GUI
# ver. 0.2.8,   2009/11/20 --> GUI: radiobuttons, checkbuttons
# ver. 0.2.7,   2009/11/19 --> language-determined pronoun selection
# ver. 0.2.6,   2009/11/18 --> dialog box (GUI)
# ver. 0.2.5,   2009/11/16 --> module for different distance measures;
#                              thousands of improvements (I/O, interface, etc.)
# ver. 0.2.2,   2009/10/25 --> numerous little improvements; deleting pronouns
# ver. 0.2.1,   2009/08/23 --> module for culling; module for bootstrapping
# ver. 0.2.0,   2009/08/23 --> module for uploading plain text files
# ver. 0.1.9,   2009/07/31 --> numerous improvements, the code simplified
# ver. 0.1.4,   2009/07/19 --> loop for different MFW settings
# ver. 0.0.1,   2009/07/01 --> some bash and awk scripts translated into R
##############################################################################

# Before defining any new objects (variables), all the existing R objects are
# recorded to preserve them from being removed; however, they are still
# vulnerable to being overwritten!
variables.not.to.be.removed = ls()

label.offset = 3
add.to.margins = 2
# "labels" || "points" || "both"
text.id.on.graphs = "labels"

######################################
new.flavor.of.cluster.analysis = FALSE
######################################

# loading the required library
if(new.flavor.of.cluster.analysis == TRUE) {
  library(ape)
}

####### GENERAL SETTINGS (GUI/TEXT-MODE) #####################################

# If you wish to use a simple yet effective graphical interface (GUI),
# just set the following option to TRUE; otherwise, switch this option to
# FALSE and edit the rest of the variables manually (see below).
# If you switch this option on, the values indicated in the following sections
# will serve as defaults for the GUI on the first run of the script on a
# corpus. In subsequent runs, the last values will appear as defaults
# in the GUI.

interactive.mode.with.GUI = TRUE

####### TEXT- AND LANGUAGE-DEPENDENT SETTINGS ################################

# format of corpus files; available choices are:
# "plain", "xml", "xml.drama", "xml.notitles", "html"
corpus.format = "plain"

# How many MFW ("Most Frequent Words") should be taken into analysis
# (if mfw.min = mfw.max, then no multiple iterations will be computed).
# The start.at option enables skipping top frequency words: you should
# indicate the desired start position of your list (in most cases you will
# probably prefer setting it to 1, the rank of the single most frequent word,
# so that no words are skipped at the top of the frequency spectrum).
mfw.min = 100
mfw.max = 100
mfw.incr = 100
start.at = 1

# The culling rate specifies the percentage of texts in a corpus in which
# a given word must be found in order to be included in the analysis.
# Thus, a 100% culling rate limits the analysis to words that appear at least
# once in every text in the corpus; at a 50% culling rate, a word is included
# in the analysis when it appears in at least half of the texts in the corpus;
# a 0% culling rate (or no culling) means that no words are omitted
# (a minimal sketch of the idea follows below).
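# For illustration only: a minimal sketch, NOT used anywhere by this script,
# of how such a culling rate could be applied to a table of frequencies
# (texts in rows, words in columns). The helper name is made up for the
# example:
apply.culling.sketch = function(freq.table, culling.rate) {
  # percentage of texts in which each word occurs at least once
  presence = 100 * colSums(freq.table > 0) / nrow(freq.table)
  # keep only those words that reach the desired culling rate
  freq.table[, presence >= culling.rate, drop = FALSE]
}
# e.g. apply.culling.sketch(my.table, 50) keeps the words attested
# in at least half of the texts; a rate of 0 keeps everything.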
# about min = max: see above
culling.min = 0
culling.max = 0
culling.incr = 20

# Deleting pronouns (this is independent of the culling procedure).
# If the "delete pronouns" option is switched to TRUE, choose one of the
# following languages: English, Polish, Latin, French, German, Italian,
# Hungarian, Dutch, Spanish (the editable lists of pronouns are available
# below; see: advanced settings).
# Additionally, there are a few variants of the language settings available:
# English.contr, English.all, and Latin.corr. Their meaning is as follows:
#   "English.contr": treats contractions as single words, i.e. strings
#       such as "don't", "you've" etc. will not be split into two words.
#   "English.all": keeps the contractions (as above), and also prevents
#       compound words (mother-in-law, double-decker, etc.) from being split.
#   "Latin.corr": since some editions do not distinguish the letters v/u,
#       this option provides a consistent conversion to "u" in each text.
delete.pronouns = FALSE
corpus.lang = "English.all"

# Selection of features. In classical approaches, frequencies of the most
# frequent words (MFW) are used as the basis for multidimensional analyses.
# It has been argued, however, that other features are also worth considering,
# especially word and/or character n-grams. The general concept of n-grams
# is to divide a string of single words/characters into a sequence of n
# elements. Given a sample sentence "This is a simple example", the character
# 2-grams are as follows: "th", "hi", "is", "s ", " i", "is", "s ", " a",
# "a ", " s", "si", "im", "mp", etc. The same sentence split into word
# 2-grams: "this is", "is a", "a simple", "simple example".
# Another question is whether using n-grams really increases the accuracy of
# attribution; further reading: Eder, M. (2011). Style-markers in authorship
# attribution: A cross-language study of the authorial fingerprint, "Studies
# in Polish Linguistics" 6: 101-16.
# Two types of n-grams are available: characters (option "c"), and words ("w").
analyzed.features = "w"
ngram.size = 1

####### MATHEMATICAL SETTINGS (DISTANCE MEASURE) #############################

# Strictly speaking, the choice of the appropriate distance measure
# is the core of the statistical procedure provided by this script.
# (However, the distance measures do not apply to the PCA method.)
# Although this choice is not easy, some of the following measures
# seem to be more suitable for linguistic purposes than others.
# On theoretical grounds, Euclidean Distance and Manhattan
# Distance should be avoided in stylometry. Canberra Distance is quite
# troublesome but effective, e.g., for Latin (it should be combined with
# careful culling settings and a limited number of MFW taken into analysis).
# For English, Classic Delta is usually a good choice. A theoretical
# explanation of the measures implemented in this script is pending.
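# For illustration only: a minimal sketch, NOT used by the script itself, of
# how Classic (Burrows) Delta can be computed between two texts. The word
# frequencies are first standardized into z-scores across the whole corpus,
# and Delta is the mean absolute difference of those z-scores. The helper
# below (its name is made up for the example) assumes a matrix of relative
# frequencies with texts in rows and the MFW in columns:
classic.delta.sketch = function(freq.table, text.a, text.b) {
  # scale() standardizes each column (i.e. each word) into z-scores
  zscores = scale(freq.table)
  mean(abs(zscores[text.a, ] - zscores[text.b, ]))
}
# e.g. classic.delta.sketch(my.table, "Austen_Emma", "Conrad_Lord_Jim")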
#
# The available distance measures (choose ONE) are as follows:
#   "CD" --> Classic Delta as developed by Burrows
#   "AL" --> Argamon's Linear Delta (based on Euclidean principles)
#   "ED" --> Eder's Delta (explanation and mathematical equation: soon)
#   "ES" --> Eder's Simple (explanation and mathematical equation: soon)
#   "MH" --> Manhattan Distance (obvious and well documented)
#   "CB" --> Canberra Distance (risky, but sometimes amazingly good)
#   "EU" --> Euclidean Distance (basic, and the most "natural")
distance.measure = "CD"

######## VISUALIZATION METHODS, LOGS, REPORTS, DISPLAY OPTIONS ###############

# Statistical method to be used; choose one:
#   Cluster Analysis: "CA"
#   Multidimensional Scaling: "MDS"
#   Principal Components Analysis (based on a covariance table): "PCV"
#   Principal Components Analysis (based on a correlation table): "PCR"
#   Bootstrap Consensus Tree: "BCT"
#
# Note on the bootstrap procedure: multiple iterations will build
# a consensus tree.
# ATTENTION: this option requires the ape library, which you can install at
# any time using the "install.packages()" command.
consensus.strength = 0.5
analysis.type = "CA"

# Do you want to display the graph on the screen?
# Do you want to write the graph directly to a graphics file? Which format?
# You can display the graph on the screen AND write it to a file (the latter
# will be done with much better quality).
display.on.screen = TRUE
write.pdf.file = FALSE
write.jpg.file = FALSE
write.emf.file = FALSE # Windows only
write.png.file = FALSE

# Dimensions of the plot area (expressed in inches), font size, and
# thickness of the lines used to plot the graphs.
# Since it is usually hard to remember all the values, an additional option
# is provided to reset the picture options -- if it is switched on,
# the remaining options will be overwritten.
plot.options.reset = FALSE
plot.custom.height = 7
plot.custom.width = 7
plot.font.size = 10
plot.line.thickness = 1

# Do you want the graphs colored? The script will automatically assign
# the same colors to those texts that share the same first segment of their
# file names (the first string ending in "_").
# Available options: "colors" || "grayscale" || "black"
colors.on.graphs = "colors"

# Do you want titles on your graphs, listing the most important parameters?
titles.on.graphs = TRUE

# Layout of the dendrogram: horizontal/vertical (Cluster Analysis only)
dendrogram.layout.horizontal = TRUE

# PCA visualization options; choose one:
# "classic" || "loadings" || "technical" || "symbols"
pca.visual.flavour = "classic"

# Sometimes, you might want to save the computed table(s) of distances
# for further analysis. Switch the following option to TRUE to make this
# possible. The same applies to the list of words (or other features)
# actually used in the analysis, i.e. after culling, pronoun deletion, etc.
# Again, one might want to save the table of frequencies actually used.
save.distance.tables = FALSE
save.analyzed.features = FALSE
save.analyzed.freqs = FALSE

####### ADVANCED SETTINGS (FOR EXPERTS ONLY) #################################

# Normally, the script computes a huge table of thousands
# of word frequencies for all the texts in your corpus. This is a non-trivial
# and time-consuming task. If it has been done once, there is no need to
# waste time and do it again, because the tables are also saved in the output
# file "table_with_frequencies.txt". To retrieve all the word
# frequencies from that file, switch this option to TRUE.
# BUT it MUST be set to FALSE when you switch corpora in the same R session,
# when you switch from word to character analysis, or change the n of your
# n-grams (or if you suddenly realize you've picked the wrong language!).
use.existing.freq.tables = FALSE

# Some people like to see what's going on, and to be able to revise/edit
# the list of words used for analysis. To meet their wishes, the script
# saves the list into a separate output file, "wordlist.txt".
# You can add any words to the list and either delete as many as you want,
# or mark the unwanted words with "#" (just like these comments are marked).
# Switching this option on prevents the script from overwriting the file,
# and makes sure that the wordlist is loaded from there.
use.existing.wordlist = FALSE

# Do you want to select the corpus files manually (rather than use the
# entire corpus directory)?
interactive.files = FALSE

# Another option makes it possible to upload the files using an external list
# of files. It should be named "files_to_analyze.txt" and be put into the
# working directory. The items (e.g. file names) should be separated either
# by spaces, tabs, or newlines. The delimiters can be mixed and/or multiplied,
# thus even a very untidy list will be interpreted correctly.
use.custom.list.of.files = TRUE

# Usually, it is recommended to cut off the tail of the word list;
# if you do not want to cut the list, then the variable may be set to an
# absurdly big number, or to "mfw.list.cutoff = mfw.list.of.all"
# (and then you are advised to use a fast computer).
mfw.list.cutoff = 5000

# pronouns (and other words) to be deleted
# * what are the selection criteria used here? Personal, possessive, ...? *

# Polish
pol.pronouns = c("ci", "ciebie", "cię", "go", "ich", "im", "ja", "ją", "je",
  "jego", "jej", "jemu", "ma", "mą", "me", "mego", "mej", "memu", "mi",
  "mną", "mnie", "moi", "moich", "moim", "moimi", "moja", "moją", "moje",
  "mojego", "mojej", "mojemu", "mój", "mu", "my", "mych", "mym", "mymi",
  "nam", "nami", "nas", "nią", "nich", "nie", "niego", "niej", "niemu",
  "nim", "nimi", "on", "ona", "one", "oni", "ono", "swa", "swą", "swe",
  "swego", "swej", "swemu", "swoi", "swoich", "swoim", "swoimi", "swoja",
  "swoją", "swoje", "swojego", "swojej", "swojemu", "swój", "swych", "swym",
  "swymi", "tobą", "tobie", "twa", "twą", "twe", "twego", "twej", "twemu",
  "twoi", "twoich", "twoim", "twoimi", "twoja", "twoją", "twoje", "twojego",
  "twojej", "twojemu", "twój", "twych", "twym", "twymi", "ty", "wam", "wami",
  "was", "wy", "wasz", "wasza", "wasze", "waszym", "waszymi", "waszych",
  "waszego", "waszej", "waszą")

# English
eng.pronouns = c("he", "her", "hers", "herself", "him", "himself", "his",
  "i", "me", "mine", "my", "myself", "our", "ours", "ourselves", "she",
  "thee", "their", "them", "themselves", "they", "thou", "thy", "thyself",
  "us", "we", "ye", "you", "your", "yours", "yourself")

# Latin
lat.pronouns = c("ea", "eae", "eam", "earum", "eas", "ego", "ei", "eis",
  "eius", "eo", "eorum", "eos", "eum", "id", "illa", "illae", "illam",
  "illarum", "illas", "ille", "illi", "illis", "illius", "illo", "illorum",
  "illos", "illud", "illum", "is", "me", "mea", "meae", "meam", "mearum",
  "meas", "mei", "meis", "meo", "meos", "meorum", "meum", "meus", "mihi",
  "nobis", "nos", "noster", "nostra", "nostrae", "nostram", "nostrarum",
  "nostras", "nostri", "nostris", "nostro", "nostros", "nostrorum",
  "nostrum", "sua", "suae", "suam", "suarum", "suas", "sui", "suis", "suo",
  "suos", "suorum", "suum", "suus", "te", "tibi", "tu", "tua", "tuae",
  "tuam", "tuarum", "tuas", "tui", "tuis", "tuo", "tuos", "tuorum",
"tuum", "tuus", "vester", "vestra", "vestrae", "vestram", "vestrarum", "vestras", "vestri", "vestris", "vestro", "vestros", "vestrorum", "vestrum", "vobis", "vos") # French fra.pronouns = c("je", "me", "moi", "tu", "te", "toi", "il", "elle", "le", "la", "lui", "se", "lui", "elle", "soi", "nous", "vous", "ils", "elles", "les", "leur", "se", "eux", "elles", "soi") # German ger.pronouns = c("ich", "mich", "mir", "mein", "meine", "meiner", "meines", "du", "dich", "dir", "dein", "deine", "deiner", "deines", "er", "sich", "ihr", "ihrer", "ihn", "ihnen", "sein", "seiner", "seines", "seine", "sie", "wir", "uns", "unser", "unsere", "euch", "eure", "euer") # Italian ita.pronouns = c("ci", "gli", "io", "la", "le", "lei", "li", "loro", "lo", "lui", "me", "mi", "noi", "si", "te", "ti", "tu", "vi", "voi", "egli", "ella", "esso", "essa", "essi", "esse", "mio", "mia", "miei", "mie", "tuo", "tua", "tuoi", "tue", "suo", "sua", "suoi", "sue", "nostro", "nostra", "nostri", "nostre", "vostro", "vostra", "vostri", "vostre", "loro", "loro", "loro", "loro") # Hungarian hun.pronouns = c("annak", "az", "azzal", "bele", "belé", "beléd", "beléje", "beléjük", "belém", "belénk", "belétek", "belöle", "belőled", "belőlem", "belőletek", "belőlük", "belőlünk", "benne", "benned", "bennem", "bennetek", "bennük", "bennünk", "én", "ennek", "enyéim", "enyém", "enyémek", "érte", "érted", "értem", "értetek", "értük", "értünk", "ez", "ezzel", "hozzá", "hozzád", "hozzája", "hozzájuk", "hozzám", "hozzánk", "hozzátok", "maga", "magáé", "magáéi", "maguk", "maguké", "magukéi", "mi", "mieink", "mienk", "miénk", "nála", "nálad", "nálam", "nálatok", "náluk", "nálunk", "neked", "nekem", "neki", "nekik", "nektek", "nekünk", "ő", "ők", "ön", "öné", "önéi", "önnek", "önnel", "önök", "önöké", "önökéi", "önökkel", "önöknek", "övé", "övéi", "övéik", "övék", "rád", "rája", "rajta", "rajtad", "rajtam", "rajtatok", "rajtuk", "rajtunk", "rájuk", "rám", "ránk", "rátok", "róla", "rólad", "rólam", "rólatok", "róluk", "rólunk", "te", "ti", "tied", "tiéd", "tieid", "tieitek ", "tietek", "tiétek", "tőle", "tőled", "tőlem", "töletek", "tőlük", "tőlünk", "vele", "veled", "velem", "veletek", "velük", "velünk") # Dutch dut.pronouns = c("hij", "haar", "haarzelf", "hijzelf", "hemzelf", "hem", "ik", "ikzelf", "mijn", "mij", "mijzelf", "me", "mezelf", "zich", "zichzelf", "ons", "onze", "onszelf", "u", "uw", "uzelf", "zij", "zijzelf", "wij", "wijzelf", "jij", "jijzelf", "jouw", "jouwe", "jou", "jouzelf", "elkaar", "hen", "henzelf", "hun", "hunzelf", "zich", "elkaar", "wie", "wat", "welke") # Spanish sp.pronouns = c("yo", "me", "mí", "tú", "te", "ti", "usted", "ud", "le", "lo", "la", "se", "sí", "él", "lo", "ella", "nos", "nosotros", "nosotras", "vosotros", "vosotras", "ustedes", "ud", "les", "los", "las", "se", "ellos", "los", "ellas") # This option enables integration with TXM corpus management system # (see: Textometrie Project, http://textometrie.ens-lyon.fr/). # Normally, it should be switched off, since it is used only when the script # is invoked from inside the TXM environment. WARNING: experimental. txm.compatibility.mode = FALSE ######## SAMPLING OPTIONS ############ sampling = "no.sampling" # || "random.sampling" || "no.sampling" # When dealing with longer text, one might want to divide these in samples of # an equal size. This can be achieved by setting the sampling variable # (default="normal.sampling") and specifying the number of words per sample # via the sample.size parameter: Integer, default=10000). 
sample.size = 10000 # expressed in words, also if you're using character n-grams

# When the analyzed texts are significantly unequal in length, it is not a bad
# idea to prepare the samples as randomly chosen "bags of words". For this,
# set the sampling variable to "random.sampling". The desired size of the
# sample should be indicated via the length.of.random.sample variable.
# Sampling with and without replacement is also available.
# (Further reading: Eder, M. (2010). Does Size Matter? Authorship Attribution,
# Short Samples, Big Problem. In "Digital Humanities 2010: Conference
# Abstracts." King's College London 2010, pp. 132-35.)
#
# ATTENTION: this makes sense only if "use.existing.freq.tables" is set
# to FALSE.
length.of.random.sample = 10000
sampling.with.replacement = FALSE

# It is also possible to use entire corpus texts as samples (regardless
# of their length and the differences therein). For this, set the sampling
# variable to "no.sampling".

# the variables are now ready to use (unless the GUI option was chosen)
# ###########################################################################



# #################################################
# sanity check for some of the initial variables -- just in case
# #################################################

# Given a language option ("English", "Polish", "Latin" etc., as described
# above), this procedure selects one of the lists of pronouns.
# If no language was chosen (or if the desired language is not supported, or
# if there was a spelling mistake), then the variable will be set to
# "English". If "delete pronouns" is set to FALSE, this is immaterial.

if(exists("pronouns") == FALSE) # checking if the "pronouns" box is empty
  pronouns = eng.pronouns

# This prevents us from choosing a non-existing distance measure -- in such
# a case the default distance (Classic Delta) will be switched on. Be aware
# of correct spelling: if the measure is misspelled, the default value will
# be assigned as well!

if(distance.measure %in% c("CD","AL","ED","ES","MH","CB","EU") == FALSE) {
  distance.measure = "CD"
}

# #################################################


# #################################################
#
# the GUI module
#
# #################################################

# At the beginning of the script, you could decide whether to use the GUI
# module or not; if the appropriate option was switched on, the GUI will
# start now. Since it's written in TclTk, with some additional twists, you
# need to install the tcltk2 package (on top of the regular tcltk, which is
# usually installed with R anyway).
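# For illustration only: a minimal, hypothetical sketch (not part of the
# script's logic) of how one might make sure tcltk2 is available before the
# GUI starts -- require() returns FALSE instead of failing when a package
# is missing:
ensure.tcltk2.sketch = function() {
  if(require(tcltk2) == FALSE) {
    install.packages("tcltk2")
    library(tcltk2)
  }
}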
if (interactive.mode.with.GUI == TRUE) {

  library(tcltk)
  library(tcltk2)

  if(file.exists("config.txt") == TRUE) {
    source("config.txt")
  }

  # ##########################################################################

  .Tcl("font create myDefaultFont -family tahoma -size 8")
  .Tcl("option add *font myDefaultFont")

  cancel_pause <- FALSE
  tt <- tktoplevel()
  tktitle(tt) <- "Stylometry with R: enter analysis parameters"

  push_OK <- function() {
    cancel_pause <<- TRUE
    tkdestroy(tt)
  }

  corpus.format <- tclVar(corpus.format)
  mfw.min <- tclVar(mfw.min)
  mfw.max <- tclVar(mfw.max)
  mfw.incr <- tclVar(mfw.incr)
  start.at <- tclVar(start.at)
  culling.min <- tclVar(culling.min)
  culling.max <- tclVar(culling.max)
  culling.incr <- tclVar(culling.incr)
  ngram.size <- tclVar(ngram.size)
  analyzed.features <- tclVar(analyzed.features)
  use.existing.freq.tables <- tclVar(use.existing.freq.tables)
  use.existing.wordlist <- tclVar(use.existing.wordlist)
  interactive.files <- tclVar(interactive.files)
  use.custom.list.of.files <- tclVar(use.custom.list.of.files)
  mfw.list.cutoff <- tclVar(mfw.list.cutoff)
  analysis.type <- tclVar(analysis.type)
  delete.pronouns <- tclVar(delete.pronouns)
  corpus.lang <- tclVar(corpus.lang)
  distance.measure <- tclVar(distance.measure)
  display.on.screen <- tclVar(display.on.screen)
  write.pdf.file <- tclVar(write.pdf.file)
  write.jpg.file <- tclVar(write.jpg.file)
  write.emf.file <- tclVar(write.emf.file)
  write.png.file <- tclVar(write.png.file)
  colors.on.graphs <- tclVar(colors.on.graphs)
  titles.on.graphs <- tclVar(titles.on.graphs)
  dendrogram.layout.horizontal <- tclVar(dendrogram.layout.horizontal)
  pca.visual.flavour <- tclVar(pca.visual.flavour)
  save.distance.tables <- tclVar(save.distance.tables)
  save.analyzed.features <- tclVar(save.analyzed.features)
  save.analyzed.freqs <- tclVar(save.analyzed.freqs)
  sampling <- tclVar(sampling)
  sample.size <- tclVar(sample.size)
  length.of.random.sample <- tclVar(length.of.random.sample)
  consensus.strength <- tclVar(consensus.strength)
  plot.options.reset <- tclVar(plot.options.reset)
  plot.custom.height <- tclVar(plot.custom.height)
  plot.custom.width <- tclVar(plot.custom.width)
  plot.font.size <- tclVar(plot.font.size)
  plot.line.thickness <- tclVar(plot.line.thickness)
  text.id.on.graphs <- tclVar(text.id.on.graphs)
  add.to.margins <- tclVar(add.to.margins)
  label.offset <- tclVar(label.offset)

  f1 <- tkframe(tt)
  f2 <- tkframe(tt)
  f3 <- tkframe(tt)
  f4 <- tkframe(tt)
  f5 <- tkframe(tt)

  # layout of the GUI begins here:
  tab1 <- function() {
    tkgrid(f1, row=1, column=0, columnspan=5)
    tkgrid.forget(f2)
    tkgrid.forget(f3)
    tkgrid.forget(f4)
    tkgrid.forget(f5)
    tkconfigure(t1.but, state="disabled", background="white")
    tkconfigure(t2.but, state="normal", background="aliceblue")
    tkconfigure(t3.but, state="normal", background="aliceblue")
    tkconfigure(t4.but, state="normal", background="aliceblue")
    tkconfigure(t5.but, state="normal", background="aliceblue")
  }
  tab2 <- function() {
    tkgrid(f2, row=1, column=0, columnspan=5)
    tkgrid.forget(f1)
    tkgrid.forget(f3)
    tkgrid.forget(f4)
    tkgrid.forget(f5)
    tkconfigure(t2.but, state="disabled", background="white")
    tkconfigure(t1.but, state="normal", background="aliceblue")
    tkconfigure(t3.but, state="normal", background="aliceblue")
    tkconfigure(t4.but, state="normal", background="aliceblue")
    tkconfigure(t5.but, state="normal", background="aliceblue")
  }
  tab3 <- function() {
    tkgrid(f3, row=1, column=0, columnspan=5)
    tkgrid.forget(f1)
    tkgrid.forget(f2)
    tkgrid.forget(f4)
    tkgrid.forget(f5)
    tkconfigure(t3.but, state="disabled", background="white")
    tkconfigure(t1.but, state="normal",
background="aliceblue") tkconfigure(t2.but,state="normal", background="aliceblue") tkconfigure(t4.but,state="normal", background="aliceblue") tkconfigure(t5.but,state="normal", background="aliceblue") } tab4 <- function() { tkgrid(f4,row=1,column=0,columnspan=5) tkgrid.forget(f1) tkgrid.forget(f2) tkgrid.forget(f3) tkgrid.forget(f5) tkconfigure(t4.but,state="disabled", background="white") tkconfigure(t1.but,state="normal", background="aliceblue") tkconfigure(t2.but,state="normal", background="aliceblue") tkconfigure(t3.but,state="normal", background="aliceblue") tkconfigure(t5.but,state="normal", background="aliceblue") } tab5 <- function() { tkgrid(f5,row=1,column=0,columnspan=5) tkgrid.forget(f1) tkgrid.forget(f2) tkgrid.forget(f3) tkgrid.forget(f4) tkconfigure(t5.but,state="disabled", background="white") tkconfigure(t1.but,state="normal", background="aliceblue") tkconfigure(t2.but,state="normal", background="aliceblue") tkconfigure(t3.but,state="normal", background="aliceblue") tkconfigure(t4.but,state="normal", background="aliceblue") } t1.but <- tkbutton(tt,text=" INPUT & LANGUAGE ",command=tab1) t2.but <- tkbutton(tt,text=" FEATURES ",command=tab2) t3.but <- tkbutton(tt,text=" STATISTICS ",command=tab3) t4.but <- tkbutton(tt,text=" SAMPLING ",command=tab4) t5.but <- tkbutton(tt,text=" OUTPUT ",command=tab5) tkgrid(t1.but) tkgrid(t2.but, column=1, row=0) tkgrid(t3.but, column=2, row=0) tkgrid(t4.but, column=3, row=0) tkgrid(t5.but, column=4, row=0) # Grid for individual tabs # initial state! tkgrid(f1,row=1,column=0,columnspan=5) tkconfigure(t1.but,state="disabled", background="white") tkconfigure(t2.but,state="normal", background="aliceblue") tkconfigure(t3.but,state="normal", background="aliceblue") tkconfigure(t4.but,state="normal", background="aliceblue") tkconfigure(t5.but,state="normal", background="aliceblue") # the OK button: active on each tab # button_1 <- tkbutton(tt,text=" OK ",command=push_OK,relief="raised",background="aliceblue") tkbind(button_1,"",push_OK) tkgrid(button_1,columnspan=10) tk2tip(button_1, "Press this only if you've visited all the tabs, or if you know\nyou want to leave values in some as they are.") ######################################################################################################################## # layout of the GUI begins here: # tkgrid(tklabel(f1,text=" "),padx=0,pady=0) # blank line (serving as the top margin) tkgrid(tklabel(f2,text=" ")) # blank line (serving as the top margin) tkgrid(tklabel(f3,text=" ")) # blank line (serving as the top margin) tkgrid(tklabel(f4,text=" ")) # blank line (serving as the top margin) tkgrid(tklabel(f5,text=" ")) # blank line (serving as the top margin) # first row: INPUT # entry_TXT <- tkradiobutton(f1) entry_XML <- tkradiobutton(f1) entry_XMLDrama <- tkradiobutton(f1) entry_XMLNoTitles <- tkradiobutton(f1) entry_HTML <- tkradiobutton(f1) # tkconfigure(entry_TXT,variable=corpus.format,value="plain") tkconfigure(entry_XML,variable=corpus.format,value="xml") tkconfigure(entry_XMLDrama,variable=corpus.format,value="xml.drama") tkconfigure(entry_XMLNoTitles,variable=corpus.format,value="xml.notitles") tkconfigure(entry_HTML,variable=corpus.format,value="html") # entrylabel_TXT <- tklabel(f1,text="plain text") entrylabel_XML <- tklabel(f1,text="xml") entrylabel_XMLDrama <- tklabel(f1,text="xml (plays)") entrylabel_XMLNoTitles <- tklabel(f1,text="xml (no titles)") entrylabel_HTML <- tklabel(f1,text="html") # tkgrid(tklabel(f1,text=" 
INPUT:"),entrylabel_TXT,entrylabel_XML,entrylabel_XMLDrama,entrylabel_XMLNoTitles,entrylabel_HTML,columnspan=1) tkgrid(tklabel(f1,text=" "),entry_TXT,entry_XML,entry_XMLDrama,entry_XMLNoTitles,entry_HTML,columnspan=1) # Tooltips for the above tk2tip(entrylabel_TXT, "Plain text files. \nIf your corpus does not contain diacritics, no encoding is needed. \nOtherwise, use ANSI for Windows, UTF-8 for Mac/Linux.") tk2tip(entrylabel_XML, "XML: all tags and TEI headers are removed.") tk2tip(entrylabel_XMLDrama, "XML for plays: all tags, TEI headers, \nand speakers' names between ... tags are removed.") tk2tip(entrylabel_XMLNoTitles, "XML contents only: all tags, TEI headers, \nand chapter/section (sub)titles between ... tags are removed.") tk2tip(entrylabel_HTML, "HTML headers, menus, links and other tags are removed.") tkgrid(tklabel(f1,text=" ")) # blank line for aesthetic purposes # next row: LANGUAGE # entry_ENG <- tkradiobutton(f1) entry_EN2 <- tkradiobutton(f1) entry_EN3 <- tkradiobutton(f1) entry_POL <- tkradiobutton(f1) entry_LAT <- tkradiobutton(f1) entry_LA2 <- tkradiobutton(f1) entry_FRA <- tkradiobutton(f1) entry_GER <- tkradiobutton(f1) entry_HUN <- tkradiobutton(f1) entry_ITA <- tkradiobutton(f1) entry_DUT <- tkradiobutton(f1) entry_SPA <- tkradiobutton(f1) # tkconfigure(entry_ENG,variable=corpus.lang,value="English") tkconfigure(entry_EN2,variable=corpus.lang,value="English.contr") tkconfigure(entry_EN3,variable=corpus.lang,value="English.all") tkconfigure(entry_LAT,variable=corpus.lang,value="Latin") tkconfigure(entry_LA2,variable=corpus.lang,value="Latin.corr") tkconfigure(entry_POL,variable=corpus.lang,value="Polish") tkconfigure(entry_FRA,variable=corpus.lang,value="French") tkconfigure(entry_GER,variable=corpus.lang,value="German") tkconfigure(entry_HUN,variable=corpus.lang,value="Hungarian") tkconfigure(entry_ITA,variable=corpus.lang,value="Italian") tkconfigure(entry_DUT,variable=corpus.lang,value="Dutch") tkconfigure(entry_SPA,variable=corpus.lang,value="Spanish") # entrylabel_ENG <- tklabel(f1,text=" English ") entrylabel_POL <- tklabel(f1,text=" Polish ") entrylabel_LAT <- tklabel(f1,text=" Latin ") entrylabel_FRA <- tklabel(f1,text=" French ") entrylabel_GER <- tklabel(f1,text=" German ") entrylabel_HUN <- tklabel(f1,text=" Hungarian ") entrylabel_ITA <- tklabel(f1,text=" Italian ") entrylabel_EN2 <- tklabel(f1,text="English (contr.)") entrylabel_EN3 <- tklabel(f1,text=" English (ALL) ") entrylabel_LA2 <- tklabel(f1,text="Latin (u/v > u) ") entrylabel_DUT <- tklabel(f1,text=" Dutch ") entrylabel_SPA <- tklabel(f1,text=" Spanish ") # tkgrid(tklabel(f1,text="LANGUAGE: "),entrylabel_ENG,entrylabel_EN2,entrylabel_EN3,entrylabel_LAT,entrylabel_LA2) tkgrid(tklabel(f1,text=" "),entry_ENG,entry_EN2,entry_EN3,entry_LAT,entry_LA2) tkgrid(tklabel(f1,text=" "),entrylabel_POL,entrylabel_HUN,entrylabel_FRA,entrylabel_ITA,entrylabel_SPA) tkgrid(tklabel(f1,text=" "),entry_POL,entry_HUN,entry_FRA,entry_SPA,entry_ITA) tkgrid(tklabel(f1,text=" "),entrylabel_DUT,entrylabel_GER) tkgrid(tklabel(f1,text=" "),entry_DUT,entry_GER) tkgrid(tklabel(f1,text=" ")) # blank line for aesthetic purposes # Tooltips for the above tk2tip(entrylabel_ENG, "Plain English: contractions and \ncompound words are split") tk2tip(entrylabel_POL, "Plain Polish: contractions and \ncompound words are split") tk2tip(entrylabel_EN2, "Modified English: \ncontractions are not split") tk2tip(entrylabel_EN3, "Further Modified English: contractions \nand compound words are not split") tk2tip(entrylabel_LAT, "Plain Latin: U and 
  tk2tip(entrylabel_FRA, "Plain French: contractions and \ncompound words are split")
  tk2tip(entrylabel_GER, "Plain German: contractions and \ncompound words are split")
  tk2tip(entrylabel_HUN, "Plain Hungarian: contractions and \ncompound words are split")
  tk2tip(entrylabel_ITA, "Plain Italian: contractions and \ncompound words are split")
  tk2tip(entrylabel_LA2, "Modified Latin: U and V \nboth treated as U")
  tk2tip(entrylabel_DUT, "Plain Dutch: contractions and \ncompound words are split")
  tk2tip(entrylabel_SPA, "Plain Castilian: contractions and \ncompound words are split")

  # next row: TEXT FEATURES
  #
  entry_W <- tkradiobutton(f2)
  entry_L <- tkradiobutton(f2)
  cb_NGRAMS <- tkcheckbutton(f2)
  entry_NGRAMSIZE <- tkentry(f2, textvariable=ngram.size, width="8")
  #
  tkconfigure(entry_W, variable=analyzed.features, value="w")
  tkconfigure(entry_L, variable=analyzed.features, value="c")
  #
  entrylabel_W <- tklabel(f2, text="words")
  entrylabel_L <- tklabel(f2, text="chars")
  entrylabel_NGRAMSIZE <- tklabel(f2, text="ngram size")
  #
  tkgrid(tklabel(f2, text=" FEATURES:"), entrylabel_W, entrylabel_L,
         entrylabel_NGRAMSIZE)
  tkgrid(tklabel(f2, text=" "), entry_W, entry_L, entry_NGRAMSIZE)

  # Tooltips for the above
  tk2tip(entrylabel_W, "Select this to work on words")
  tk2tip(entrylabel_L, "Select this to work on characters \n(does not make much sense unless you use ngrams)")
  tk2tip(entrylabel_NGRAMSIZE, "State your n for n-grams \nto work on word/char clusters of n")

  tkgrid(tklabel(f2, text=" ")) # blank line for aesthetic purposes

  # next row: MFW SETTINGS
  #
  entry_MFW_MIN <- tkentry(f2, textvariable=mfw.min, width="8")
  entry_MFW_MAX <- tkentry(f2, textvariable=mfw.max, width="8")
  entry_MFW_INCR <- tkentry(f2, textvariable=mfw.incr, width="8")
  entry_START_AT <- tkentry(f2, textvariable=start.at, width="8")
  #
  entrylabel_MFW_MIN <- tklabel(f2, text="Minimum")
  entrylabel_MFW_MAX <- tklabel(f2, text="Maximum")
  entrylabel_MFW_INCR <- tklabel(f2, text="Increment")
  entrylabel_START_AT <- tklabel(f2, text="Start at freq. rank")
  #
  tkgrid(tklabel(f2, text="MFW SETTINGS:"), entrylabel_MFW_MIN,
         entrylabel_MFW_MAX, entrylabel_MFW_INCR, entrylabel_START_AT)
  tkgrid(tklabel(f2, text=" "), entry_MFW_MIN, entry_MFW_MAX,
         entry_MFW_INCR, entry_START_AT)

  tkgrid(tklabel(f2, text=" ")) # blank line for aesthetic purposes

  # Tooltips for the above
  tk2tip(entrylabel_MFW_MIN, "Set the minimum number of most frequent words. \nThe script will conduct its first analysis for \nthe number of words specified here")
  tk2tip(entrylabel_MFW_MAX, "Set the maximum number of most frequent words. \nThe script will conduct its final analysis for \nthe number of words specified here")
  tk2tip(entrylabel_MFW_INCR, "Set the increment added to \nthe minimum number of most frequent \nwords for each subsequent analysis.")
  tk2tip(entrylabel_START_AT, "Set the number of words from the top of \nthe frequency list to skip in the analysis.")

  # next row: CULLING
  #
  cb_DEL_PRON <- tkcheckbutton(f2)
  #
  entry_CUL_MIN <- tkentry(f2, textvariable=culling.min, width="8")
  entry_CUL_MAX <- tkentry(f2, textvariable=culling.max, width="8")
  entry_CUL_INCR <- tkentry(f2, textvariable=culling.incr, width="8")
  entry_CUT_OFF <- tkentry(f2, textvariable=mfw.list.cutoff, width="8")
  tkconfigure(cb_DEL_PRON, variable=delete.pronouns)
  #
  entrylabel_CUL_MIN <- tklabel(f2, text="Minimum")
  entrylabel_CUL_MAX <- tklabel(f2, text="Maximum")
  entrylabel_CUL_INCR <- tklabel(f2, text="Increment")
  entrylabel_CUT_OFF <- tklabel(f2, text="List Cutoff")
  cblabel_DEL_PRON <- tklabel(f2, text="Delete pronouns")
  #
  tkgrid(tklabel(f2, text=" CULLING:"), entrylabel_CUL_MIN,
         entrylabel_CUL_MAX, entrylabel_CUL_INCR, entrylabel_CUT_OFF,
         cblabel_DEL_PRON)
  tkgrid(tklabel(f2, text=" "), entry_CUL_MIN, entry_CUL_MAX,
         entry_CUL_INCR, entry_CUT_OFF, cb_DEL_PRON)

  tkgrid(tklabel(f2, text=" ")) # blank line for aesthetic purposes

  # next row: LISTS & FILES
  #
  cb_FREQS <- tkcheckbutton(f2)
  cb_LISTS <- tkcheckbutton(f2)
  cb_INTFILES <- tkcheckbutton(f2)
  cb_MYFILES <- tkcheckbutton(f2)
  #
  tkconfigure(cb_FREQS, variable=use.existing.freq.tables)
  tkconfigure(cb_LISTS, variable=use.existing.wordlist)
  tkconfigure(cb_INTFILES, variable=interactive.files)
  tkconfigure(cb_MYFILES, variable=use.custom.list.of.files)
  #
  cblabel_FREQS <- tklabel(f2, text="Existing frequencies")
  cblabel_LISTS <- tklabel(f2, text="Existing wordlist")
  cblabel_INTFILES <- tklabel(f2, text="Select files manually")
  cblabel_MYFILES <- tklabel(f2, text="List of files")
  #
  tkgrid(tklabel(f2, text=" VARIOUS:"), cblabel_FREQS, cblabel_LISTS,
         cblabel_INTFILES, cblabel_MYFILES)
  tkgrid(tklabel(f2, text=" "), cb_FREQS, cb_LISTS, cb_INTFILES, cb_MYFILES)

  # Tooltips for the above
  tk2tip(entrylabel_CUL_MIN, "State the minimum culling setting. \n0 means no words are omitted from the analysis. \n50 means a word needs to appear in \nat least 50% of the texts to be included in the analysis. \n100 means that only words appearing in all the texts \nwill be included in the analysis")
  tk2tip(entrylabel_CUL_MAX, "State the maximum culling setting. \n0 means no words are omitted from the analysis. \n50 means a word needs to appear in \nat least 50% of the texts to be included in the analysis. \n100 means that only words appearing in all the texts \nwill be included in the analysis")
  tk2tip(entrylabel_CUL_INCR, "State the increment added to the minimum culling \nsetting for each subsequent analysis.")
  tk2tip(entrylabel_CUT_OFF, "Set the maximum size of the word frequency table. \nAnything above 5000 requires patience and a fast computer")
  tk2tip(cblabel_DEL_PRON, "Select if you want to omit pronouns in the analysis. \nThis improves attribution in some languages")
  tk2tip(cblabel_FREQS, "Select to use the frequency lists generated by the previous analysis. \nThis speeds up the process dramatically. \nA very bad idea if you've just changed your selection of texts!")
  tk2tip(cblabel_LISTS, "Select to use the wordlist generated by \nthe previous analysis or a custom wordlist.")
  tk2tip(cblabel_INTFILES, "Select this to manually select files \nrather than use the entire corpus. \nMake sure that \"Existing frequencies\" is unchecked!")
\nMake sure that \"Existing frequencies\" is unchecked!") tk2tip(cblabel_MYFILES, "Select this if you want to use custom list \nof files to be loaded; the list should be stored \nin \"files_to_analyze.txt\", and the entries delimited \nwith spaces, tabs, or newlines.") tkgrid(tklabel(f2,text=" ")) # blank line for aesthetic purposes # next row: STATISTICS # entry_CA <- tkradiobutton(f3) entry_MDS <- tkradiobutton(f3) entry_PCA1 <- tkradiobutton(f3) entry_PCA2 <- tkradiobutton(f3) entry_CONS_TREE <- tkradiobutton(f3) entry_CONSS <- tkentry(f3,textvariable=consensus.strength,width="8") # tkconfigure(entry_CA,variable=analysis.type,value="CA") # cluster.analysis tkconfigure(entry_MDS,variable=analysis.type,value="MDS") # multidimensional.scaling tkconfigure(entry_PCA1,variable=analysis.type,value="PCV") # pca.covariance.table tkconfigure(entry_PCA2,variable=analysis.type,value="PCR") # pca.correlation.table tkconfigure(entry_CONS_TREE,variable=analysis.type,value="BCT") # make.consensus.tree # entrylabel_CA <- tklabel(f3,text="Cluster Analysis") entrylabel_MDS <- tklabel(f3,text="MDS") entrylabel_PCA1 <- tklabel(f3,text="PCA (cov.)") entrylabel_PCA2 <- tklabel(f3,text="PCA (corr.)") entrylabel_CONS_TREE <- tklabel(f3,text="Consensus Tree") entrylabel_CONSS <- tklabel(f3,text="Consensus strength") # tkgrid(tklabel(f3,text=" STATISTICS:"),entrylabel_CA,entrylabel_MDS,entrylabel_PCA1,entrylabel_PCA2) tkgrid(tklabel(f3,text=" "),entry_CA,entry_MDS,entry_PCA1,entry_PCA2) tkgrid(tklabel(f3,text=" "),entrylabel_CONS_TREE,entrylabel_CONSS) tkgrid(tklabel(f3,text=" "),entry_CONS_TREE,entry_CONSS) # Tooltips for the above tk2tip(entrylabel_CA, "Select to perform Cluster Analysis of Delta distance table. \nThis only makes sense if there is a single iteration \n(or only a few), so set MFW_MIN and MFW_MAX \nto equal values, which in turn makes the MFW_INCR setting immaterial. \nThen do the same for your culling settings.") tk2tip(entrylabel_MDS, "Select to perform Multidimensional Scaling of Delta distance table. \nThis only makes sense if there is a single iteration \n(or only a few), so set MFW_MIN and MFW_MAX \nto equal values, which in turn makes the MFW_INCR setting immaterial. \nThen do the same for your culling settings.") tk2tip(entrylabel_PCA1, "Select to perform Principal Components Analysis based on a covariance matrix of Delta distance table. \nThis only makes sense if there is a single iteration (or only a few), so set MFW_MIN and MFW_MAX \nto equal values, which in turn makes the MFW_INCR setting immaterial. \nThen do the same for your culling settings.") tk2tip(entrylabel_PCA2, "Select to perform Principal Components Analysis based on a correlation matrix of Delta distance table. \nThis only makes sense if there is a single iteration (or only a few), so set MFW_MIN and MFW_MAX \nto equal values, which in turn makes the MFW_INCR setting immaterial. \nThen do the same for your culling settings.") tk2tip(entrylabel_CONS_TREE, "Select to perform multiple iterations of Cluster Analysis of Delta distance table \nresulting in a Bootstrap COnsensus Tree. This only makes sense \nif you have at least three valid iterations, so set MFW_MIN and MFW_MAX, \nand/or CUL_MIN and CUL_MAX to different values.") tk2tip(entrylabel_CONSS, "Select to set the consensus strength for the Bootstrap Tree. \nOnly makes sense if you select that option is checked above. 
  # next row: DISTANCES
  #
  entry_CD <- tkradiobutton(f3)
  entry_AL <- tkradiobutton(f3)
  entry_ED <- tkradiobutton(f3)
  entry_ES <- tkradiobutton(f3)
  entry_MH <- tkradiobutton(f3)
  entry_CB <- tkradiobutton(f3)
  entry_EU <- tkradiobutton(f3)
  #
  tkconfigure(entry_CD, variable=distance.measure, value="CD")
  tkconfigure(entry_AL, variable=distance.measure, value="AL")
  tkconfigure(entry_ED, variable=distance.measure, value="ED")
  tkconfigure(entry_ES, variable=distance.measure, value="ES")
  tkconfigure(entry_MH, variable=distance.measure, value="MH")
  tkconfigure(entry_CB, variable=distance.measure, value="CB")
  tkconfigure(entry_EU, variable=distance.measure, value="EU")
  #
  entrylabel_CD <- tklabel(f3, text="Classic Delta")
  entrylabel_AL <- tklabel(f3, text="Argamon's Delta")
  entrylabel_ED <- tklabel(f3, text="Eder's Delta")
  entrylabel_ES <- tklabel(f3, text="Eder's Simple")
  entrylabel_MH <- tklabel(f3, text="Manhattan")
  entrylabel_CB <- tklabel(f3, text="Canberra")
  entrylabel_EU <- tklabel(f3, text="Euclidean")
  #
  tkgrid(tklabel(f3, text=" DISTANCES:"), entrylabel_CD, entrylabel_AL,
         entrylabel_ED, entrylabel_ES)
  tkgrid(tklabel(f3, text=" "), entry_CD, entry_AL, entry_ED, entry_ES)
  tkgrid(tklabel(f3, text=" "), entrylabel_MH, entrylabel_CB, entrylabel_EU)
  tkgrid(tklabel(f3, text=" "), entry_MH, entry_CB, entry_EU)

  tkgrid(tklabel(f3, text=" ")) # blank line for aesthetic purposes

  # Tooltips for the above
  tk2tip(entrylabel_CD, "Select the Classic Delta measure as developed by Burrows.")
  tk2tip(entrylabel_AL, "Select Argamon's Linear Delta (based on Euclidean principles).")
  tk2tip(entrylabel_ED, "Select Eder's Delta (explanation and mathematical equation: TBA).")
  tk2tip(entrylabel_ES, "Select Eder's Simple measure (explanation and mathematical equation: TBA).")
  tk2tip(entrylabel_MH, "Select Manhattan Distance (obvious and well documented).")
  tk2tip(entrylabel_CB, "Select Canberra Distance (risky, but sometimes amazingly good).")
  tk2tip(entrylabel_EU, "Select Euclidean Distance (basic and the most *natural*).")

  # next row: SAMPLING
  entry_SAMP <- tkradiobutton(f4)
  entry_RAND <- tkradiobutton(f4)
  entry_NOSAMP <- tkradiobutton(f4)
  tkconfigure(entry_SAMP, variable=sampling, value="normal.sampling")
  tkconfigure(entry_RAND, variable=sampling, value="random.sampling")
  tkconfigure(entry_NOSAMP, variable=sampling, value="no.sampling")
  entry_SAMPLESIZE <- tkentry(f4, textvariable=sample.size, width="10")
  entry_SIZE <- tkentry(f4, textvariable=length.of.random.sample, width="10")
  entrylabel_SAMP <- tklabel(f4, text="Normal sampling")
  entrylabel_RAND <- tklabel(f4, text="Random sampling")
  entrylabel_NOSAMP <- tklabel(f4, text="No sampling")
  entrylabel_SAMPLESIZE <- tklabel(f4, text="Sample size")
  entrylabel_SIZE <- tklabel(f4, text="Random sample size")

  # Position and display sampling parameters on the grid:
  tkgrid(entrylabel_NOSAMP, entrylabel_SAMP, entrylabel_RAND)
  tkgrid(entry_NOSAMP, entry_SAMP, entry_RAND)
  tkgrid(tklabel(f4, text=" "), entrylabel_SAMPLESIZE, entrylabel_SIZE)
  tkgrid(tklabel(f4, text=" "), entry_SAMPLESIZE, entry_SIZE)

  tkgrid(tklabel(f4, text=" ")) # blank line for aesthetic purposes

  # Tooltips for the above
  tk2tip(entrylabel_SAMP, "Specify whether the texts in the corpus should be divided into equal-sized samples.")
  tk2tip(entrylabel_SAMPLESIZE, "Specify the size of the samples (expressed in words). \nOnly relevant when normal sampling is switched on.")
  tk2tip(entrylabel_RAND, "When the analyzed texts are significantly unequal in length, \nit is not a bad idea to prepare samples as randomly chosen *bags of words*. \nIf this option is switched on, the desired size of a sample should be indicated.")
  tk2tip(entrylabel_SIZE, "Specify the random sample size. \nOnly relevant when random sampling is switched on.")
  tk2tip(entrylabel_NOSAMP, "No internal sampling will be performed: entire texts are treated as samples.")

  # next row: OUTPUT
  #
  cb_SCRN <- tkcheckbutton(f5)
  cb_PDF <- tkcheckbutton(f5)
  cb_JPG <- tkcheckbutton(f5)
  cb_EMF <- tkcheckbutton(f5)
  cb_PNG <- tkcheckbutton(f5)
  cb_PLOT.RESET <- tkcheckbutton(f5)
  entry_COLOR <- tkradiobutton(f5)
  entry_GRAY <- tkradiobutton(f5)
  entry_BW <- tkradiobutton(f5)
  entry_LABELS <- tkradiobutton(f5)
  entry_POINTS <- tkradiobutton(f5)
  entry_BOTH <- tkradiobutton(f5)
  entry_CLASSIC <- tkradiobutton(f5)
  entry_LOADINGS <- tkradiobutton(f5)
  entry_TECHNICAL <- tkradiobutton(f5)
  entry_SYMBOLS <- tkradiobutton(f5)
  cb_TITLE <- tkcheckbutton(f5)
  cb_HORIZ <- tkcheckbutton(f5)
  cb_TABLESAVE <- tkcheckbutton(f5)
  cb_FEATURESAVE <- tkcheckbutton(f5)
  cb_FREQSAVE <- tkcheckbutton(f5)
  tkconfigure(entry_LABELS, variable=text.id.on.graphs, value="labels")
  tkconfigure(entry_POINTS, variable=text.id.on.graphs, value="points")
  tkconfigure(entry_BOTH, variable=text.id.on.graphs, value="both")
  entry_MARGINS <- tkentry(f5, textvariable=add.to.margins, width="8")
  entry_OFFSET <- tkentry(f5, textvariable=label.offset, width="8")
  entry_PLOT.HEIGHT <- tkentry(f5, textvariable=plot.custom.height, width="8")
  entry_PLOT.WIDTH <- tkentry(f5, textvariable=plot.custom.width, width="8")
  entry_PLOT.FONT <- tkentry(f5, textvariable=plot.font.size, width="8")
  entry_PLOT.LINE <- tkentry(f5, textvariable=plot.line.thickness, width="8")
  tkconfigure(cb_SCRN, variable=display.on.screen)
  tkconfigure(cb_PDF, variable=write.pdf.file)
  tkconfigure(cb_JPG, variable=write.jpg.file)
  tkconfigure(cb_EMF, variable=write.emf.file)
  tkconfigure(cb_PNG, variable=write.png.file)
  tkconfigure(entry_COLOR, variable=colors.on.graphs, value="colors")
  tkconfigure(entry_GRAY, variable=colors.on.graphs, value="grayscale")
  tkconfigure(entry_BW, variable=colors.on.graphs, value="black")
  tkconfigure(cb_PLOT.RESET, variable=plot.options.reset)
  tkconfigure(entry_CLASSIC, variable=pca.visual.flavour, value="classic")
  tkconfigure(entry_LOADINGS, variable=pca.visual.flavour, value="loadings")
  tkconfigure(entry_TECHNICAL, variable=pca.visual.flavour, value="technical")
  tkconfigure(entry_SYMBOLS, variable=pca.visual.flavour, value="symbols")
  tkconfigure(cb_TITLE, variable=titles.on.graphs)
  tkconfigure(cb_HORIZ, variable=dendrogram.layout.horizontal)
  tkconfigure(cb_TABLESAVE, variable=save.distance.tables)
  tkconfigure(cb_FEATURESAVE, variable=save.analyzed.features)
  tkconfigure(cb_FREQSAVE, variable=save.analyzed.freqs)
  #
  cblabel_SCRN <- tklabel(f5, text=" Onscreen ")
  cblabel_PDF <- tklabel(f5, text=" PDF ")
  cblabel_JPG <- tklabel(f5, text=" JPG ")
  cblabel_EMF <- tklabel(f5, text=" EMF ")
  cblabel_PNG <- tklabel(f5, text=" PNG ")
  entrylabel_COLOR <- tklabel(f5, text="Colors")
  entrylabel_GRAY <- tklabel(f5, text="Grayscale")
  entrylabel_BW <- tklabel(f5, text="Black")
  cblabel_PLOT.RESET <- tklabel(f5, text="Set default")
  entrylabel_PLOT.HEIGHT <- tklabel(f5, text="Plot height")
  entrylabel_PLOT.WIDTH <- tklabel(f5, text="Plot width")
  entrylabel_PLOT.FONT <- tklabel(f5, text="Font size")
  entrylabel_PLOT.LINE <- tklabel(f5, text="Line width")
tklabel(f5,text="Labels") entrylabel_POINTS <- tklabel(f5,text="Points") entrylabel_BOTH <- tklabel(f5,text="Both") entrylabel_MARGINS <- tklabel(f5,text="Margins") entrylabel_OFFSET <- tklabel(f5,text="Label offset") entrylabel_CLASSIC <- tklabel(f5,text=" Classic ") entrylabel_LOADINGS <- tklabel(f5,text=" Loadings ") entrylabel_TECHNICAL <- tklabel(f5,text=" Technical ") entrylabel_SYMBOLS <- tklabel(f5,text=" Symbols ") cblabel_TITLE <- tklabel(f5,text=" Titles ") cblabel_HORIZ <- tklabel(f5,text="Horizontal CA tree") cblabel_TABLESAVE <- tklabel(f5,text="Save distance table") cblabel_FEATURESAVE <- tklabel(f5,text="Save features") cblabel_FREQSAVE <- tklabel(f5,text="Save frequencies") # tkgrid(tklabel(f5,text=" GRAPHS:"), cblabel_SCRN,cblabel_PDF, cblabel_JPG,cblabel_EMF,cblabel_PNG,columnspan=5) tkgrid(tklabel(f5,text=" "), cb_SCRN,cb_PDF,cb_JPG,cb_EMF,cb_PNG,columnspan=5) tkgrid(tklabel(f5,text=" ")) # blank line for aesthetic purposes tkgrid(tklabel(f5,text=" PLOT AREA:"), cblabel_PLOT.RESET,entrylabel_PLOT.HEIGHT, entrylabel_PLOT.WIDTH,entrylabel_PLOT.FONT,entrylabel_PLOT.LINE,columnspan=5) tkgrid(tklabel(f5,text=" "), cb_PLOT.RESET,entry_PLOT.HEIGHT,entry_PLOT.WIDTH,entry_PLOT.FONT,entry_PLOT.LINE,columnspan=5) tkgrid(tklabel(f5,text=" "), tklabel(f5,text=" "),entrylabel_COLOR,entrylabel_GRAY,entrylabel_BW,cblabel_TITLE,columnspan=5) tkgrid(tklabel(f5,text=" "), tklabel(f5,text=" "),entry_COLOR,entry_GRAY,entry_BW,cb_TITLE,columnspan=5) tkgrid(tklabel(f5,text=" ")) # blank line for aesthetic purposes tkgrid(tklabel(f5,text=" PCA/MDS:"), entrylabel_LABELS, entrylabel_POINTS, entrylabel_BOTH, entrylabel_MARGINS,entrylabel_OFFSET, columnspan=5) tkgrid(tklabel(f5,text=" "), entry_LABELS, entry_POINTS, entry_BOTH, entry_MARGINS,entry_OFFSET, columnspan=5) tkgrid(tklabel(f5,text=" ")) # blank line for aesthetic purposes tkgrid(tklabel(f5,text="PCA FLAVOUR:"), entrylabel_CLASSIC, entrylabel_LOADINGS, entrylabel_TECHNICAL, entrylabel_SYMBOLS, columnspan=5) tkgrid(tklabel(f5,text=" "), entry_CLASSIC, entry_LOADINGS, entry_TECHNICAL, entry_SYMBOLS, columnspan=5) tkgrid(tklabel(f5,text=" VARIOUS:"), cblabel_HORIZ,cblabel_TABLESAVE,cblabel_FEATURESAVE,cblabel_FREQSAVE,columnspan=5) tkgrid(tklabel(f5,text=" "), cb_HORIZ,cb_TABLESAVE,cb_FEATURESAVE,cb_FREQSAVE,columnspan=5) tkgrid(tklabel(f5,text=" ")) # blank line for aesthetic purposes # Tooltips for the above tk2tip(cblabel_SCRN, "Select to have your diagram(s) displayed on R's standard graphics device.") tk2tip(cblabel_PDF, "Select to save your diagram(s) as (a) PDF file(s).") tk2tip(cblabel_JPG, "Select to save your diagram(s) as (a) JPG file(s).") tk2tip(cblabel_EMF, "Select to save your diagram(s) as (a) EMF file(s). \nOnly works in Windows.") tk2tip(cblabel_PNG, "Select to save your diagram(s) as (a) PNG file(s). 
  tk2tip(cblabel_PLOT.RESET, "Restore graphic parameters to safe default values \n(7x7 inches, 10-point font size, normal line width).")
  tk2tip(entrylabel_PLOT.HEIGHT, "Set custom plot height (in inches).")
  tk2tip(entrylabel_PLOT.WIDTH, "Set custom plot width (in inches).")
  tk2tip(entrylabel_PLOT.FONT, "Set custom font size (in points).")
  tk2tip(entrylabel_PLOT.LINE, "Set custom line width \n(in R units, 1 is default).")
  tk2tip(entrylabel_COLOR, "Select to have automatic color coding of tip labels by author.")
  tk2tip(entrylabel_GRAY, "Select to have automatic color coding (in grayscale) of tip labels by author.")
  tk2tip(entrylabel_BW, "Select to have a black & white graph.")
  tk2tip(cblabel_TITLE, "Select to have automatic titles (folder name, analysis type, \nMFW settings, etc.) on your graph(s).")
  tk2tip(entrylabel_LABELS, "Use only labels on MDS/PCA plots.")
  tk2tip(entrylabel_POINTS, "Use only points on MDS/PCA plots.")
  tk2tip(entrylabel_BOTH, "Use both labels and points on MDS/PCA plots.")
  tk2tip(entrylabel_MARGINS, "Set custom margin size \n(in percentage of the plot area).")
  tk2tip(entrylabel_OFFSET, "Set custom offset between label and point \n(in percentage of the plot area).")
  tk2tip(entrylabel_CLASSIC, "Original PCA visualization using (colored) sample names.")
  tk2tip(entrylabel_LOADINGS, "Display PCA feature (word etc.) loadings.")
  tk2tip(entrylabel_TECHNICAL, "Technical grayscale PCA visualization, showing feature loadings as well as a PC barplot.\nPotentially useful for grayscale printing in traditional publications.")
  tk2tip(entrylabel_SYMBOLS, "Select to display the samples in your PCA with a group symbol (instead of their entire name).\nPotentially useful when dealing with lots of samples.")
  tk2tip(cblabel_HORIZ, "Select to have your Cluster Analysis graph oriented horizontally. \nProbably the better option for clarity.")
  tk2tip(cblabel_TABLESAVE, "Save final distance table(s) in separate text file(s).")
  tk2tip(cblabel_FEATURESAVE, "Save final feature (word, n-gram) list(s), e.g. the words actually used in the analysis.")
  tk2tip(cblabel_FREQSAVE, "Save frequency table(s) in separate text file(s).")

  # next row: the OK button
  #
  # button_1 <- tkbutton(tt, text=" OK ", command=push_OK, relief="groove")
  # tkbind(button_1, "<Return>", push_OK)
  # tkgrid(button_1, columnspan="10")

  tkgrid(tklabel(tt, text=" ")) # blank line (i.e., bottom margin)

  ##########

  repeat {
    if(cancel_pause) {
      analyzed.features <- as.character(tclvalue(analyzed.features))
      ngram.size <- as.numeric(tclvalue(ngram.size))
      corpus.format <- as.character(tclvalue(corpus.format))
      mfw.min <- as.numeric(tclvalue(mfw.min))
      mfw.max <- as.numeric(tclvalue(mfw.max))
      mfw.incr <- as.numeric(tclvalue(mfw.incr))
      start.at <- as.numeric(tclvalue(start.at))
      culling.min <- as.numeric(tclvalue(culling.min))
      culling.max <- as.numeric(tclvalue(culling.max))
      culling.incr <- as.numeric(tclvalue(culling.incr))
      use.existing.freq.tables <- as.logical(as.numeric(tclvalue(use.existing.freq.tables)))
      use.existing.wordlist <- as.logical(as.numeric(tclvalue(use.existing.wordlist)))
      interactive.files <- as.logical(as.numeric(tclvalue(interactive.files)))
      use.custom.list.of.files <- as.logical(as.numeric(tclvalue(use.custom.list.of.files)))
      analysis.type <- as.character(tclvalue(analysis.type))
      delete.pronouns <- as.logical(as.numeric(tclvalue(delete.pronouns)))
      display.on.screen <- as.logical(as.numeric(tclvalue(display.on.screen)))
      write.pdf.file <- as.logical(as.numeric(tclvalue(write.pdf.file)))
      write.jpg.file <- as.logical(as.numeric(tclvalue(write.jpg.file)))
      write.emf.file <- as.logical(as.numeric(tclvalue(write.emf.file)))
      write.png.file <- as.logical(as.numeric(tclvalue(write.png.file)))
      colors.on.graphs <- as.character(tclvalue(colors.on.graphs))
      pca.visual.flavour <- as.character(tclvalue(pca.visual.flavour))
      titles.on.graphs <- as.logical(as.numeric(tclvalue(titles.on.graphs)))
      dendrogram.layout.horizontal <- as.logical(as.numeric(tclvalue(dendrogram.layout.horizontal)))
      save.distance.tables <- as.logical(as.numeric(tclvalue(save.distance.tables)))
      save.analyzed.features <- as.logical(as.numeric(tclvalue(save.analyzed.features)))
      save.analyzed.freqs <- as.logical(as.numeric(tclvalue(save.analyzed.freqs)))
      sampling <- as.character(tclvalue(sampling))
      sample.size <- as.numeric(tclvalue(sample.size))
      length.of.random.sample <- as.numeric(tclvalue(length.of.random.sample))
      mfw.list.cutoff <- as.numeric(tclvalue(mfw.list.cutoff))
      distance.measure <- as.character(tclvalue(distance.measure))
      corpus.lang <- as.character(tclvalue(corpus.lang))
      consensus.strength <- as.numeric(tclvalue(consensus.strength))
      plot.options.reset <- as.logical(as.numeric(tclvalue(plot.options.reset)))
      plot.custom.height <- as.numeric(tclvalue(plot.custom.height))
      plot.custom.width <- as.numeric(tclvalue(plot.custom.width))
      plot.font.size <- as.numeric(tclvalue(plot.font.size))
      plot.line.thickness <- as.numeric(tclvalue(plot.line.thickness))
      text.id.on.graphs <- as.character(tclvalue(text.id.on.graphs))
      add.to.margins <- as.numeric(tclvalue(add.to.margins))
      label.offset <- as.numeric(tclvalue(label.offset))
      break
    }
  }
  # the GUI font can be deleted only once, after the dialog has been closed
  .Tcl("font delete myDefaultFont")

} # <-- here the option "interactive.mode.with.GUI == TRUE" is completed

# #################################################
# GUI module explicit feliciter (Phew!)
# #################################################
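# For orientation: "config.txt", written at the end of this script by the
# var.name() helper below, is itself a plain R file with one assignment per
# line, e.g. (illustrative values only):
#
#   corpus.format = "plain"
#   mfw.min = 100
#   delete.pronouns = FALSE
#
# which is why a simple source("config.txt") at GUI startup (see above) is
# enough to restore the settings from the previous run.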
# #############################################################################
# Final settings (you are advised rather not to change them)
# #############################################################################

# The chosen language option should be followed by an assignment of
# the appropriate set of pronouns. The following code is responsible for it
if(corpus.lang == "English") pronouns = eng.pronouns
if(corpus.lang == "Polish") pronouns = pol.pronouns
if(corpus.lang == "Latin") pronouns = lat.pronouns
if(corpus.lang == "Latin.corr") pronouns = lat.pronouns
if(corpus.lang == "French") pronouns = fra.pronouns
if(corpus.lang == "German") pronouns = ger.pronouns
if(corpus.lang == "Italian") pronouns = ita.pronouns
if(corpus.lang == "Hungarian") pronouns = hun.pronouns
if(corpus.lang == "Dutch") pronouns = dut.pronouns

# Since it is not so easy to perform, say, 17.9 iterations, or analyze
# 543.3 words, the code below rounds off all numerical variables to
# the nearest positive integers, to prevent you from making silly jokes
# with funny settings. (OK, it is still possible to crash the script in
# more ways than one, but you will have to find them on your own.)
mfw.min = round(mfw.min)
mfw.max = round(mfw.max)
mfw.incr = round(mfw.incr)
start.at = round(start.at)
culling.min = round(culling.min)
culling.max = round(culling.max)
culling.incr = round(culling.incr)
mfw.list.cutoff = round(mfw.list.cutoff)
sample.size = round(sample.size)

# resetting the default plot area (if an appropriate option has been chosen)
if(plot.options.reset == TRUE) {
plot.custom.height = 7
plot.custom.width = 7
plot.font.size = 10
plot.line.thickness = 1
plot.options.reset = FALSE
}

# If TXM compatibility mode has been chosen, other options need to be switched off
if(txm.compatibility.mode == TRUE) {
# checking if a frequency table has been passed from TXM to R
if(exists("txm.generated.freq.table") == TRUE) {
# inheriting the table from TXM, transposing it on the way
frequencies.0.culling = t(txm.generated.freq.table)
# getting rid of the first row of the table
frequencies.0.culling = frequencies.0.culling[-1,]
# set the variable use.existing.freq.tables to skip uploading corpus files
use.existing.freq.tables = TRUE
} else {
cat("\n\n !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!\n",
"Oops! To use TXM compatibility mode, you have to launch TXM first!\n",
"!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!\n\n")
stop("Incorrect input data")
}
}

# Finally, we want to save some of the variable values for later use;
# they are automatically loaded into the GUI at the next run of the script.
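# (For illustration, config.txt will contain one plain "name = value"
# assignment per saved setting, e.g.:
#     corpus.format = "plain"
#     mfw.min = 100
# i.e., valid R code that can be sourced back in at the next run.)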
cat("",file="config.txt",append=F) var.name<-function(x) { if(is.character(x)==TRUE) { cat(paste(deparse(substitute(x))," = \"",x,"\"", sep=""),file="config.txt",sep="\n",append=T) } else { cat(paste(deparse(substitute(x)),x, sep=" = "),file="config.txt",sep="\n",append=T) } } var.name(corpus.format) var.name(corpus.lang) var.name(analyzed.features) var.name(ngram.size) var.name(mfw.min) var.name(mfw.max) var.name(mfw.incr) var.name(start.at) var.name(culling.min) var.name(culling.max) var.name(culling.incr) var.name(mfw.list.cutoff) var.name(delete.pronouns) var.name(analysis.type) var.name(use.existing.freq.tables) var.name(use.existing.wordlist) var.name(use.custom.list.of.files) var.name(consensus.strength) var.name(distance.measure) var.name(display.on.screen) var.name(write.pdf.file) var.name(write.jpg.file) var.name(write.emf.file) var.name(write.png.file) var.name(save.distance.tables) var.name(save.analyzed.features) var.name(save.analyzed.freqs) var.name(colors.on.graphs) var.name(titles.on.graphs) var.name(dendrogram.layout.horizontal) var.name(pca.visual.flavour) var.name(sampling) var.name(sample.size) var.name(length.of.random.sample) var.name(sampling.with.replacement) var.name(plot.custom.height) var.name(plot.custom.width) var.name(plot.font.size) var.name(plot.line.thickness) var.name(label.offset) var.name(add.to.margins) var.name(text.id.on.graphs) # ############################################################################# # no additional margin will be added on PCA/MDS plots, unless points & labels # are switched to be shown together if(text.id.on.graphs != "both") { label.offset = 0 } # if a chosen plot area is really large (and a bitmap output has been chosen), # a warning will appear if(write.jpg.file == TRUE || write.emf.file == TRUE || write.png.file == TRUE){ # if the desired height*width (at 300 dpi) exceeds 36Mpx if(300*plot.custom.width * 300*plot.custom.height > 36000000) { cat("\nYou have chosen a bitmap output format and quite a large plot area\n") cat("of", plot.custom.width, "by", plot.custom.height, "inches. Producing some", as.integer(300*plot.custom.width * 300*plot.custom.height / 1000000), "Megapixels will take a good while.\n\n") cat(" i - ignore this warning and continue with the current settings\n") cat(" p - use pdf format instead of a bitmap (default)\n") cat(" s - shrink the plot area to a reasonable size of 20x20 inches\n") cat(" a - abort the script\n") # reading from the prompt answer = readline("\n[i/p/s/a] ") if(tolower(answer) == "a") { stop("The script stopped by the user") } else if(tolower(answer) == "i") { cat("Okay (but honestly, do you really need such a large plot?)\n") } else if(tolower(answer) == "s") { cat("The plot area will be shrunken to 20x20 inches\n") plot.custom.width = 20 plot.custom.height = 20 } else { cat("Withdrawing from the bitmap output, performing pdf instead\n") write.jpg.file = FALSE write.emf.file = FALSE write.png.file = FALSE write.pdf.file = TRUE } } } # ############################################################################# # ################################################# # FUNCTIONS: # ################################################# # Function for combining single features (words # or characters) into n-grams, or strings of n elements; # e.g. character 2-grams of the sentence "This is a sentence" # are as follows: "th", "hi", "is", "s ", " i", "is", etc. 
# Required argument: name of the vector of words/chars
# #################################################
make.ngrams = function(input.text) {
txt = c()
if(ngram.size > 1) {
txt = input.text
for(n in 2:ngram.size) {
txt = paste(txt[1:(length(txt)-1)],input.text[n:length(input.text)])
}
} else {
# if n-gram size is set to 1, then nothing will happen
txt = input.text
}
return(txt)
}

# #################################################
# The generic function for splitting a given input text into
# single words (chains of characters delimited with
# spaces or punctuation marks). Alternatively,
# you can replace it with another rule.
# Required argument: name of the text to be split.
# ATTENTION: this is (almost) the only piece of coding in this script
# that depends on the operating system used
# #################################################
split.into.words = function(input.text) {
# splitting into units specified by regular expression; here,
# all sequences between non-letter characters are assumed to be words:
if(Sys.info()[["sysname"]] == "Windows") {
### Windows
tokenized.text = c(unlist(strsplit(input.text, "\\W+|_+",perl=T)))
} else {
### Linux, Mac
tokenized.text = c(unlist(strsplit(input.text, "[^[:alpha:]]+")))
}
return(tokenized.text)
}

# #################################################
# Function that carries out the necessary modifications
# for feature selection: it converts a sample into
# the type of sequence needed (n-grams etc.) and
# returns the new list of items
# Argument: a vector of words (or chars)
# #################################################
sample.to.features = function(sample){
# 1. splitting the sample into chars (if analyzed.features was set to "c")
if(analyzed.features == "c") {
sample = paste(sample, collapse=" ")
sample = unlist(strsplit(sample,""))
}
# 2. making n-grams (if an appropriate option has been chosen):
if(ngram.size > 1) {
sample = make.ngrams(sample)
}
return(sample)
}

# #################################################
# Function for splitting a given input text into
# single words (chains of characters delimited with
# spaces or punctuation marks). There is also an option
# of splitting the text into characters and/or performing
# splitting into n-grams (see above)
# #################################################
parse.text = function(input.text) {
# loading the file; optionally, fiddling with apostrophes and contractions:
#
# this is the standard procedure of splitting input texts
if(corpus.lang != "English.contr" && corpus.lang != "English.all") {
tokenized.text = split.into.words(input.text)
}
# if the Latin option with adjusting the v/u letters is on,
# this smashes the distinction and converts both types to the letter u
if(corpus.lang == "Latin.corr") {
tokenized.text = gsub("v","u",tokenized.text)
}
# this code is used for English corpora only
if(corpus.lang == "English.contr" || corpus.lang == "English.all") {
# replacing non-ASCII apostrophes with simple ' (standard ASCII char)
tokenized.text = gsub("'","'",input.text)
# getting rid of contractions ('t, 's, 've, 'd, 'll, 'em, 'im) by replacing
# their apostrophes with ^ (other apostrophes will not be replaced);
# of course, if your corpus is Cockney, you should edit the
# "([tsdm]|ll|ve|em|im)" statement accordingly.
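# (illustration: the substitution below turns "don't" into "don^t" and
# "she's" into "she^s", so that each contraction survives the subsequent
# splitting as a single token)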
tokenized.text = gsub("([[:alpha:]])'([tsdm]|ll|ve|em|im)\\b","\\1^\\2", tokenized.text) #' # adding spaces around dashes (to distinguish dashes and hyphens) tokenized.text = gsub("[-]{2,5}"," -- ",tokenized.text) # depending on which option was swithed on, either the contractions are # kept, or all the peculiarities, i.e. both contractions and hyphens if(corpus.lang == "English.contr") { tokenized.text=c(unlist(strsplit(tokenized.text,"[^[:alpha:]^]+"))) } if(corpus.lang == "English.all") { tokenized.text=c(unlist(strsplit(tokenized.text,"[^[:alpha:]^-]+"))) # trying to clean the remaining dashes: tokenized.text = gsub("^[-]+$","",tokenized.text) } } # trying to avoid empty strings: tokenized.text = tokenized.text[nchar(tokenized.text)>0] # trying to get rid of non-letter characters: tokenized.text = tokenized.text[grep("[^[:digit:]]",tokenized.text)] # sanity check for text length: abort if the current text is extremely # short or at least shorter than the specified sample size if (length(tokenized.text) < 10 || (sampling == "normal.sampling" && length(tokenized.text) < sample.size) || (sampling == "random.sampling" && length(tokenized.text) < sample.size)) { cat("\n\n",file, "\t", "This sample is too short!", "\n\n") setwd(".."); stop("Corpus error...") } # at this point, each text in the corpus has been tokenized # into an array of tokens which we can divide into samples samples.from.text = list() if (sampling == "normal.sampling"){ # initialize variables to sample the text text.length = length(tokenized.text) number.of.samples = floor(text.length/(sample.size)) cat(paste("\t", "- text length (in words): ", text.length, "\n", sep="")) cat(paste("\t", "- nr. of samples: ", number.of.samples, "\n", sep="")) cat(paste("\t", "- nr. of words dropped at the end of the file: ", text.length-(number.of.samples*sample.size), "\n", sep="")) # iterate over the samples: current.start.index = 1 for(sample.index in 1:number.of.samples) { current.sample = tokenized.text[current.start.index:(current.start.index+sample.size-1)] current.sample = sample.to.features(current.sample) # flush current sample: samples.from.text[[sample.index]] = current.sample # increment index for next iteration current.start.index = current.start.index + sample.size current.sample = c() } } else if(sampling == "random.sampling"){ # if random sampling was chosen, the text will be randomized and a sample of a given length will be excerpted current.sample = head(sample(tokenized.text, replace = sampling.with.replacement), length.of.random.sample) current.sample = sample.to.features(current.sample) samples.from.text[[1]] = current.sample } else if (sampling == "no.sampling"){ # entire texts will be used as a sample (regardless of its length) current.sample = tokenized.text current.sample = sample.to.features(current.sample) samples.from.text[[1]] = current.sample } return(samples.from.text) } # ################################################# # Function for adjusting different input formats: # xml (TEI) in two variants, html, and plain text files. 
# Required argument: name of the text to pre-process # ################################################# delete.markup = function(input.text) { if(corpus.format == "xml" || corpus.format == "xml.drama") { # getting rid of the TEI header (if it exists) if(length(grep("",input.text)) > 0) { input.text = input.text[-c(1:(grep("",input.text)))] } # the whole text into one (very) long line preprocessed.text = paste(input.text, collapse=" ") # getting rid of dramatis personae if(corpus.format == "xml.drama"){ preprocessed.text = gsub(".*?","",preprocessed.text) } # getting rid of comments and (editorial) notes preprocessed.text = gsub("","",preprocessed.text) # getting rid of all the remaining tags preprocessed.text = gsub("<.*?>","",preprocessed.text) } if(corpus.format == "html") { # getting rid of html header (if exists) if(length(grep(" 0) { input.text = input.text[-c(1:(grep("","",preprocessed.text) # getting rid of all the remaining tags preprocessed.text = gsub("<.*?>","",preprocessed.text) } else { preprocessed.text = input.text } return(preprocessed.text) } # ################################################# # Function for graph auto-coloring; depending on # the user's choice, it assigns colors or grayscale tones # to matching strings of characters in texts' names # (as a delimiter, the underscore character is used); # alternatively, all the labels can be marked black. # Required argument: a vector of text labels # Optional argument: col="colors" || "grayscale" || "back" # ################################################# assign.plot.colors = function(names.of.the.texts,col="colors") { if(col == "black") { colors.of.pca.graph = "black" } else { color.numeric.values = c(1) current.color = 1 # a loop for matching the subsequent strings of chars for(w in 2:length(names.of.the.texts)) { # if two samples have the same id, don't change the color if(gsub("_.*","",names.of.the.texts)[w] %in% gsub("_.*","",names.of.the.texts[1:(w-1)]) == TRUE) { find.color = which(gsub("_.*","",names.of.the.texts) == gsub("_.*","",names.of.the.texts)[w])[1] current.color = color.numeric.values[find.color] # if the samples differ, assign the next color (actually, the color's id) } else { current.color = max(color.numeric.values) + 1 } # append the recent color to the final vector of color values color.numeric.values = c(color.numeric.values, current.color) } # define a vector of available colors, if an appropriate option was chosen if(col == "colors") { available.colors = rep(c("red","green","blue","black","orange","purple", "darkgrey","brown","maroon4","mediumturquoise","gold4", "deepskyblue", "yellowgreen","grey","chartreuse4", "khaki", "navy", "palevioletred", "greenyellow", "darkolivegreen4", "chocolate4"),10) } # define a vector of gray tones, instead of colors if(col == "grayscale") { number.of.colors.required = max(color.numeric.values) available.colors = gray(seq(0,0.7,0.7/(number.of.colors.required-1))) } # produce the final vector of colors (or gray tones) colors.of.pca.graph = available.colors[c(color.numeric.values)] } return(colors.of.pca.graph) } # ################################################# # Function that finds out the coordinates # of scatterplots; it computes the extreme x and y # values, adds some margins, and optionally extends # the top margin if a plot uses sample labels # Required arguments: (1) a vector of x coordinates, # (2) a vector of y coordinates, optionally with names; # optional arguments: (3) additional margins (expressed # in % of actual plot area), (4) label offset (in %) 
# #################################################
define.plot.area = function(x.coord,y.coord,xymargins=2,v.offset=0) {
# get horizontal plotting area (reasonable margins added on both sides):
# (1) how long are the extreme samples' names
left.label.length = nchar(names(x.coord)[(order(x.coord)[1])])
right.label.length = nchar(names(x.coord)[(order(x.coord,decreasing=T)[1])])
# (2) checking if the sample labels really exist
if(length(left.label.length) == 0) { left.label.length = 0 }
if(length(right.label.length) == 0) { right.label.length = 0 }
# (3) x axis expansion (0.5% for each character of the extreme samples' names)
left.expansion = left.label.length * 0.005
right.expansion = right.label.length * 0.005
# (4) size of the x axis
x.axis.size = abs(max(x.coord) - min(x.coord))
# (5) finally, get both x coordinates
min.x = min(x.coord) - (left.expansion + 0.01 * xymargins) * x.axis.size
max.x = max(x.coord) + (right.expansion + 0.01 * xymargins) * x.axis.size
#
# get vertical plotting area (plus optional top margin):
# (1) size of the y axis
y.axis.size = abs(max(y.coord) - min(y.coord))
# (2) finally, get both y coordinates; the label offset passed as v.offset
# extends the top margin, so that the samples' labels can fit
min.y = min(y.coord) - 0.01 * xymargins * y.axis.size
max.y = max(y.coord) + (0.01 * v.offset + 0.01 * xymargins) * y.axis.size
#
plot.area = list(c(min.x, max.x), c(min.y, max.y))
return(plot.area)
}
# #################################################

# #################################################
# the module for loading a corpus from the text files;
# it can be omitted if the frequency table already exists
# (then "use.existing.freq.tables" should be set
# to TRUE in the preamble of the script/GUI)
# #################################################
#
# Checking: (1) whether to produce a new frequency table or to use
# the existing one; (2) whether the tables are stored in memory or
# written into files.
# If you have chosen to use an existing table but it is not available,
# your choice will be ignored and the table will be created from scratch.

# checking if an appropriate frequency table exists
if(exists("frequencies.0.culling") == FALSE
&& file.exists("table_with_frequencies.txt") == FALSE ) {
use.existing.freq.tables = FALSE
}

if(use.existing.freq.tables == TRUE) {
if(exists("frequencies.0.culling")) {
cat("\n", "using frequency table stored as variables...", "\n")
} else {
cat("\n", "reading file with frequency table...", "\n")
frequencies.0.culling = t(read.table("table_with_frequencies.txt"))
cat("\n", "frequency table loaded successfully", "\n\n")
}
# extracting names of the texts
corpus.filenames = rownames(frequencies.0.culling)
#
# checking whether an existing wordlist should be used
if (use.existing.wordlist == TRUE && file.exists("wordlist.txt") == TRUE){
cat("\n", "reading a wordlist from file...", "\n")
mfw.list.of.all = scan("wordlist.txt",what="char",sep="\n")
mfw.list.of.all = c(grep("^[^#]",mfw.list.of.all,value=TRUE))
#
# adjusting the size of the frequency table according to the existing wordlist
frequencies.0.culling =
frequencies.0.culling[,colnames(frequencies.0.culling) %in% mfw.list.of.all]
} else {
# the wordlist will be created from the existing frequency tables
mfw.list.of.all = colnames(frequencies.0.culling)
# some comments into the file containing the wordlist
cat("# This file contains the words that were used in the table",
"# of frequencies uploaded from an external file. The current list",
"# can be used for the next tasks, and for this purpose it can be",
"# manually revised, edited, deleted, culled, etc.",
"# You can either delete unwanted words, or mark them with \"#\"",
"# -----------------------------------------------------------------",
"", file="wordlist.txt", sep="\n")
# the current wordlist into a file
cat(mfw.list.of.all, file="wordlist.txt", sep="\n",append=T)
}

# if the existing table will not be used, then begin producing the new table
} else {
#
# Retrieving the names of texts
#
# first, it's possible to choose the files manually
if (interactive.files == TRUE) {
setwd("corpus")
corpus.filenames = basename(tk_choose.files(default = "",
caption = "Select at least 2 files", multi = TRUE))
setwd("..")
} else {
# alternatively, one can use the files listed in "files_to_analyze.txt";
# the listed files can be separated by spaces, tabs, or newlines
if(use.custom.list.of.files == TRUE && file.exists("files_to_analyze.txt") == TRUE) {
# a message on the screen
cat("\n")
cat("external list of files will be used for uploading the corpus\n\n")
# retrieving the filenames from a file
corpus.filenames = scan("files_to_analyze.txt",what="char",sep="\n",quiet=T)
# getting rid of spaces and/or tabs
corpus.filenames = unlist(strsplit(corpus.filenames,"[ \t]+"))
# checking whether all the files indicated on the list really exist
if( length(setdiff(corpus.filenames,list.files("corpus"))) > 0 ){
# if not, then send a message and list the suspicious filenames
cat("!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!\n")
cat("the following files have not been found:\n")
cat(setdiff(corpus.filenames, list.files("corpus")),"\n\n")
cat("!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!\n")
# use only those files that match
corpus.filenames = intersect(corpus.filenames, list.files("corpus"))
}
} else {
corpus.filenames = list.files("corpus")
}
}
#
# Checking whether the required files and subdirectory exist
if(file.exists("corpus")==FALSE) {
cat("\n\n !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!\n",
"Hey! The working directory should contain the subdirectory \"corpus\"\n",
"!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!\n\n")
stop("Corpus prepared incorrectly")
}
if(length(corpus.filenames) < 2 && sampling != "normal.sampling") {
cat("\n\n !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!\n",
"Ho! The subdirectory \"corpus\" should contain at least two text samples!\n",
"!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!\n\n")
stop("Corpus prepared incorrectly")
}
#
# loading the corpus from individual text files
loaded.corpus = list()
if (sampling == "normal.sampling"){
cat(paste("Performing sampling (using sample size = ", sample.size," words)\n", sep=""))
} else if(sampling == "random.sampling"){
cat(paste("Performing random sampling (using random sample size = ",
length.of.random.sample, " words)\n", sep=""))
} else if (sampling == "no.sampling"){
cat(paste("Performing no sampling (using entire text as sample)", "\n", sep=""))
} else {
stop("Exception raised: something is wrong with the sampling parameter you have specified...")
}
# temporarily change the working directory to the corpus directory
setwd("corpus")
for (file in corpus.filenames) {
cat(paste("Loading ", file, "\t", "...", "\n", sep=""))
# loading the next file from the list "corpus.filenames"
current.file = tolower(scan(file,what="char",sep="\n", quiet=T))
# delete xml/html markup (if applicable)
current.file = delete.markup(current.file)
# deleting punctuation, splitting into words:
samples.from.text = parse.text(current.file)
# appending the current text to the virtual corpus
if (sampling == "normal.sampling"){
for (index in 1:length(samples.from.text)){
# add the parsed sample to the corpus (and remove the filename extension)
loaded.corpus[[paste(gsub("(\\.txt$)|(\\.xml$)|(\\.html$)|(\\.htm$)",
"",file), "-", index, sep="")]] = samples.from.text[[index]]
}
} else {
loaded.corpus[[file]] = unlist(samples.from.text)
}
}
# reset the working directory
setwd("..")
cat(paste("Total nr. of samples in the corpus: ", length(loaded.corpus), "\n"))
#
#
# the directory with corpus must contain enough texts;
# if the number of text samples is lower than 2, the script will abort.
if( (length(corpus.filenames) < 2) & (sampling=="no.sampling") ) {
cat("\n\n","your corpus folder seems to be empty!", "\n\n")
stop("corpus error")
}
#
#
# We need a list of the most frequent words used in the current corpus,
# in descending order, without frequencies (just a list of words). It can be
# either loaded from a file (then set the option "use.existing.wordlist=TRUE"),
# or created by the code provided below:
#
if (use.existing.wordlist == TRUE && file.exists("wordlist.txt") == TRUE) {
cat("\n", "reading a wordlist from file...", "\n")
# loading the wordlist file, changing to lowercase
mfw.list.of.all = tolower(scan("wordlist.txt",what="char",sep="\n"))
# getting rid of commented lines in the wordlist file
mfw.list.of.all = c(grep("^[^#]",mfw.list.of.all,value=TRUE))
} else {
# Extracting all the words used in the corpus
#
wordlist.of.loaded.corpus = c()
for (file in 1:length(loaded.corpus)) {
# loading the next sample from the list "loaded.corpus"
current.text = loaded.corpus[[file]]
# putting samples together:
wordlist.of.loaded.corpus = c(wordlist.of.loaded.corpus, current.text)
# cat(names(loaded.corpus[file]),"\t","tokenized successfully", "\n")
}
#
# preparing a sorted frequency list of the whole set
mfw.list.of.all = sort(table(c(wordlist.of.loaded.corpus)),decreasing=T)
# if the whole list is long, then cut off the tail, as specified in the GUI
# by the cutoff value
if (length(mfw.list.of.all) > mfw.list.cutoff) {
mfw.list.of.all = mfw.list.of.all[1:mfw.list.cutoff]
}
# the only thing we need are words ordered by frequency (no frequencies)
mfw.list.of.all = names(mfw.list.of.all)
#
# some comments into the file containing the wordlist
cat("# This file contains the words that were used for building the table",
"# of frequencies. It can be also used for further tasks, and for this",
"# purpose it can be manually revised, edited, deleted, culled, etc.",
"# You can either delete unwanted words, or mark them with \"#\"",
"# -----------------------------------------------------------------------",
"", file="wordlist.txt", sep="\n")
# the current wordlist into the "wordlist.txt" file
cat(mfw.list.of.all, file="wordlist.txt", sep="\n",append=T)
#
} # <----- conditional expr. "use.existing.wordlist" terminates here
#
# blank line on the screen
cat("\n")
#
# #################################################
# FUNCTION: make.parallel.frequency.lists()
# preparing a huge table with all the frequencies
# (up to mfw.list.cutoff items).
# Two arguments are required -- a vector with sample names
# and a specified variable where the corpus is stored (in a list)
# #################################################
#
make.parallel.frequency.lists = function(sample.names,current.corpus) {
freq.list.of.all.the.samples = c()
freq.list.of.current.sample = c()
for (sample.name in sample.names) {
# loading the next sample from the list "sample.names"
current.sample = current.corpus[[sample.name]]
# preparing the frequency list of the current text (relative frequencies, in %)
raw.freq = table(current.sample) * 100 / length(current.sample)
# adjusting the frequency list to the main MFW list obtained above
freq.list.of.current.sample = raw.freq[mfw.list.of.all]
# taking the names (sc. words) from the main MFW list
names(freq.list.of.current.sample) = mfw.list.of.all
# and inserting the current sample into the general frequency table
freq.list.of.all.the.samples =
rbind(freq.list.of.all.the.samples, freq.list.of.current.sample)
# a short message on the screen:
# cat(file, "\t", "excerpted successfully", "\n")
cat(".")
}
# adjusting names of the rows (=samples)
rownames(freq.list.of.all.the.samples) = c(sample.names)
# the result of the function
return(freq.list.of.all.the.samples)
}
#
# preparing a huge table of all the frequencies for the whole corpus
frequencies.0.culling =
make.parallel.frequency.lists(names(loaded.corpus),loaded.corpus)
# all NA values will be adjusted to 0
frequencies.0.culling[which(is.na(frequencies.0.culling))] = 0
#
# getting rid of all-zero columns (this might happen in random sampling
# or when custom wordlists are used)
frequencies.0.culling = frequencies.0.culling[,colSums(frequencies.0.culling) != 0]
#
#
#
# writing the frequency tables to text files (they can be re-used!)
write.table(t(frequencies.0.culling), file="table_with_frequencies.txt",
sep="\t", row.names=TRUE, col.names=TRUE)
} # <----- conditional expr. "use.existing.freq.tables" terminates here
#
#
# #################################################
# the module for loading the corpus terminates here
# #################################################

# #################################################
# MAIN PROGRAM; the main loop is below
# #################################################

# saving the original mfw.max value in mfw.max.original
# this is useful for graph subtitles
mfw.max.original = mfw.max

# the general counter for various purposes: initialization
number.of.current.iteration = 0

# load the ape library; make an empty bootstrap.results list
# this will be executed only if the bootstrap option is checked
if (analysis.type == "BCT") {
library(ape)
bootstrap.list = list()
}

# #################################################
# module for culling (THE MAIN LOOP IN THE PROGRAM)
# #################################################

# testing if the desired culling settings are acceptable;
# if too large, culling is set to the maximum possible (100%)
if(culling.max > 100) {
culling.max = 100
}
# if too small, it is set to 0 (i.e. minimal value)
if(culling.min < 0) {
culling.min = 0
}
# avoiding infinite loops
if(culling.incr <= 1) {
culling.incr = 10
}

# #################################################

for(j in (culling.min/culling.incr):(culling.max/culling.incr)) {
current.culling = j * culling.incr

# the beginning of the culling procedure
raw.list.after.culling = c()
# extracting non-zero values from the frequency table
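# (illustration: with 10 texts in the corpus and current.culling = 50,
# a word survives the procedure below only if its frequency is non-zero
# in at least 5 of the 10 texts)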
nonzero.values = frequencies.0.culling > 0
# counting non-zero values
for (y in 1:length(nonzero.values[1,])) {
raw.list.after.culling = c(raw.list.after.culling,
(length(grep("TRUE",nonzero.values[,y])) / length(nonzero.values[,y]))
>= current.culling/100 )
}
# a raw culling list has no word-identification; let's change it:
names(raw.list.after.culling) = colnames(frequencies.0.culling)
# a simple sequence of words which have not been culled
list.of.words.after.culling =
c(names(raw.list.after.culling[grep("TRUE",raw.list.after.culling)]))
# procedure for deleting pronouns
if (delete.pronouns == TRUE) {
list.of.words.after.culling =
list.of.words.after.culling[!(list.of.words.after.culling %in% pronouns)]
}
# the above list-of-not-culled to be applied to the wordlist:
table.with.all.freqs = frequencies.0.culling[,c(list.of.words.after.culling)]
# the names of the samples are passed to the frequency table
if(use.existing.freq.tables == FALSE) {
rownames(table.with.all.freqs) = names(loaded.corpus)
}
# #################################################
# culling is done, but we are still inside the main loop

# starting the frequency list at the frequency rank set in the start.at option above
table.with.all.freqs = table.with.all.freqs[,start.at:length(table.with.all.freqs[1,])]

# Testing if the desired MFW number is acceptable;
# if MFW is too large, it is set to the maximum possible.
if(mfw.max > length(table.with.all.freqs[1,])) {
mfw.max = length(table.with.all.freqs[1,])
}
# if too small, it is set to 1 (i.e., minimal value)
if(mfw.min < 1) {
mfw.min = 1
}
# if culling is too strong, sometimes strange things may happen; let's block it
if(mfw.min > mfw.max) {
mfw.min = mfw.max
}

# MFW set to mfw.max for a while (it will change later on)
mfw = mfw.max

cat("\n\n")
cat("culling @ ", current.culling,"\t","available words ",mfw.max,"\n")

# #################################################
# z-scores calculations
# #################################################

if((analysis.type == "CA") || (analysis.type == "BCT") || (analysis.type == "MDS")){
# calculating z-scores (a message on the screen)
cat("Calculating z-scores... \n\n")

# Entropy distance: experimental, not entirely available yet
# (the results do not really differ from those for typical word frequencies)
#
#A = t(t(table.with.all.freqs + 1) / colSums(table.with.all.freqs + 1))
#B = t(t(log(table.with.all.freqs + 2)) / -(colSums(A * log(A))))
#table.with.all.freqs = B
#
# calculating z-scores
table.with.all.zscores = scale(table.with.all.freqs)
table.with.all.zscores = table.with.all.zscores[,]
}

# #################################################
# the internal loop starts here (for i = mfw.min : mfw.max)
# #################################################

# a short message on the screen about distance calculations (when appropriate):
if((analysis.type == "CA") || (analysis.type == "BCT") || (analysis.type == "MDS")){
if(distance.measure == "CD") {
cat("Calculating classic Delta distances... \n")
}
if(distance.measure == "AL") {
cat("Calculating Argamon's Delta distances... \n")
}
if(distance.measure == "ED") {
cat("Calculating Eder's Delta distances... \n")
}
if(distance.measure == "ES") {
cat("Calculating Eder's Simple distances... \n")
}
if(distance.measure == "MH") {
cat("Calculating Manhattan distances... \n")
}
if(distance.measure == "CB") {
cat("Calculating Canberra distances... \n")
}
if(distance.measure == "EU") {
cat("Calculating Euclidean distances... \n")
}
}

cat("MFW used: ")

for(i in (mfw.min/mfw.incr):(mfw.max/mfw.incr)) {

mfw = i * mfw.incr

# for safety reasons, if MFWs > words in samples
if(mfw > length(list.of.words.after.culling) ) {
mfw = length(list.of.words.after.culling)
}

# the general counter for various purposes
number.of.current.iteration = number.of.current.iteration + 1

# the current task (number of MFW currently analyzed) echoed on the screen
cat(mfw, " ")

# #################################################
# module for calculating distances between texts
# #################################################

if((analysis.type == "CA") || (analysis.type == "BCT") || (analysis.type == "MDS")){
# calculating Delta distances to a distance matrix
if(distance.measure == "CD") {
distance.name.on.graph = "Classic Delta distance"
distance.name.on.file = "Classic Delta"
distance.table =
as.matrix(dist(table.with.all.zscores[,1:mfw], method="manhattan")) / mfw
}
# calculating Argamon's "Linear Delta"
if(distance.measure == "AL") {
distance.name.on.graph = "Argamon's Delta distance"
distance.name.on.file = "Argamon's Delta"
distance.table =
as.matrix(dist(table.with.all.zscores[,1:mfw], method="euclidean")) / mfw
}
# calculating Delta distances with Eder's modifications
if(distance.measure == "ED") {
distance.name.on.graph = "Eder's Delta distance"
distance.name.on.file = "Eder's Delta"
zscores.plus.e.value = t(t(table.with.all.zscores[,1:mfw])*((1+mfw:1)/mfw))
distance.table = as.matrix(dist(zscores.plus.e.value,method="manhattan"))
}
# calculating Eder's Simple distance to a distance matrix
if(distance.measure == "ES") {
distance.table =
as.matrix(dist(sqrt(table.with.all.freqs[,1:mfw]),method="manhattan"))
distance.name.on.graph = "Eder's Simple distance"
distance.name.on.file = "Eder's Simple"
}
# calculating Manhattan distance to a distance matrix
if(distance.measure == "MH") {
distance.name.on.graph = "Manhattan distance"
distance.name.on.file = "Manhattan"
distance.table = as.matrix(dist(table.with.all.freqs[,1:mfw],method="manhattan"))
}
# calculating Canberra distance to a distance matrix
if(distance.measure == "CB") {
distance.name.on.graph = "Canberra distance"
distance.name.on.file = "Canberra"
distance.table = as.matrix(dist(table.with.all.freqs[,1:mfw],method="canberra"))
}
# calculating Euclidean distance to a distance matrix
if(distance.measure == "EU") {
distance.name.on.graph = "Euclidean distance"
distance.name.on.file = "Euclidean"
distance.table = as.matrix(dist(table.with.all.freqs[,1:mfw],method="euclidean"))
}
# replacing the names of the samples (the filename extension is cut off)
rownames(distance.table) = gsub("(\\.txt$)|(\\.xml$)|(\\.html$)|(\\.htm$)",
"",rownames(table.with.all.freqs))
colnames(distance.table) = gsub("(\\.txt$)|(\\.xml$)|(\\.html$)|(\\.htm$)",
"",rownames(table.with.all.freqs))
}

# #################################################
# a tiny module for graph auto-coloring
# #################################################
names.of.the.texts = gsub("(\\.txt)|(\\.xml)|(\\.html)|(\\.htm)","",
rownames(table.with.all.freqs))
colors.of.pca.graph = assign.plot.colors(names.of.the.texts,col=colors.on.graphs)

# #################################################
# preparing the graphs
# #################################################

# The name of a given method will appear in the title of the graph
# (if the appropriate option was chosen), and will be pasted into
# a filename of the current job. First, variables are initiated...
name.of.the.method = "" short.name.of.the.method = "" mfw.info = mfw plot.current.task = function() {NULL} # getting rid of redundant start.at information if(start.at == 1) { start.at.info = "" } else { start.at.info = paste("Started at",start.at) } # getting rid of redundant pronoun information if(delete.pronouns == TRUE) { pronouns.info = paste("Pronouns deleted") } else { pronouns.info = "" } # getting rid of redundant culling information if(culling.min == culling.max) { culling.info = culling.min } else { culling.info = paste(culling.min,"-",culling.max,sep="") } # prepares a dendrogram for the current MFW value for CA plotting if(analysis.type == "CA") { name.of.the.method = "Cluster Analysis" short.name.of.the.method = "CA" if(dendrogram.layout.horizontal == TRUE) { dendrogram.margins = c(5,4,4,8)+0.1 } else { dendrogram.margins = c(8,5,4,4)+0.1 } # the following task will be plotted plot.current.task = function(){ par(mar=dendrogram.margins) ######################################################################## ######################################################################## # color graphs, but using different clustering algorithm if(new.flavor.of.cluster.analysis == TRUE) { plot(nj(distance.table), font=1, tip.color=colors.of.pca.graph) # alternatively, the traditional approach: } else { ######################################################################## # clustering the distances stored in the distance.table clustered.data = hclust(as.dist(distance.table),"ward") # reordering the vector of colors to fit the order of clusters colors.on.dendrogram = colors.of.pca.graph[clustered.data$order] # converting the clusters into common dendrogram format tree.with.clusters = as.dendrogram(clustered.data,hang=0) # now, preparing the procedure for changing leaves' color attributes # (this snippet is taken from "help(dendrapply)" and slightly adjusted) local({ colLab <<- function(n) { if(is.leaf(n)) { a <- attributes(n) i <<- i+1 attr(n, "nodePar") <- c(a$nodePar, lab.col = mycols[i], pch = NA) } n } mycols = colors.on.dendrogram i <- 0 }) # adding the attributes to subsequent leaves of the dendrogram, # using the above colLab(n) function dendrogram.with.colors = dendrapply(tree.with.clusters, colLab) # finally, ploting the whole stuff plot(dendrogram.with.colors, main = graph.title, horiz = dendrogram.layout.horizontal) # OBSOLETE PROCEDURE of plotting the dendrograms # plot(as.dendrogram(hclust(as.dist(distance.table),"ward"), hang=0), # main = graph.title, # horiz = dendrogram.layout.horizontal) if(dendrogram.layout.horizontal == TRUE) { title(sub=graph.subtitle) } else { title(sub=graph.subtitle, outer=TRUE, line=-1) } } }} # prepares a 2-dimensional plot (MDS) for plotting if(analysis.type == "MDS") { name.of.the.method = "Multidimensional Scaling" distance.name.on.graph = "" distance.name.on.file = "" short.name.of.the.method = "MDS" mds.results = cmdscale(distance.table,eig=TRUE) xy.coord = mds.results$points[,1:2] if(text.id.on.graphs == "both") { label.coord = cbind(mds.results$points[,1],(mds.results$points[,2] + (0.01*label.offset* abs(max(mds.results$points[,2]) - min(mds.results$points[,2]))))) } else { label.coord = xy.coord } plot.area = define.plot.area(mds.results$points[,1],mds.results$points[,2], xymargins=add.to.margins, v.offset=label.offset) plot.current.task = function(){ if(text.id.on.graphs == "points" || text.id.on.graphs == "both") { plot(xy.coord, type="p", ylab="", xlab="", xlim=plot.area[[1]],ylim=plot.area[[2]], main = graph.title, sub = 
graph.subtitle, col = colors.of.pca.graph, lwd = plot.line.thickness) } if(text.id.on.graphs == "labels") { plot(xy.coord, type="n", ylab="", xlab="", xlim=plot.area[[1]],ylim=plot.area[[2]], main = graph.title, sub = graph.subtitle, col = colors.of.pca.graph, lwd = plot.line.thickness) } if(text.id.on.graphs == "labels" || text.id.on.graphs == "both") { text(label.coord, rownames(label.coord), col=colors.of.pca.graph) } axis(1,lwd=plot.line.thickness) axis(2,lwd=plot.line.thickness) box(lwd=plot.line.thickness) } } # prepares Principal Components Analysis (PCA) for plotting if(analysis.type == "PCV" || analysis.type == "PCR") { # set some string information variables name.of.the.method = "Principal Components Analysis" short.name.of.the.method = "PCA" distance.name.on.file = "PCA" if(analysis.type == "PCV") { pca.results = prcomp(table.with.all.freqs[,1:mfw]) distance.name.on.graph = "Covariance matrix" } else if(analysis.type == "PCR") { pca.results = prcomp(table.with.all.freqs[,1:mfw], scale=TRUE) distance.name.on.graph = "Correlation matrix" } # get the variation explained by the PCs: expl.var = round(((pca.results$sdev^2)/sum(pca.results$sdev^2)*100),1) PC1_lab = paste("PC1 (",expl.var[1],"%)", sep="") PC2_lab = paste("PC2 (",expl.var[2],"%)", sep="") xy.coord = pca.results$x[,1:2] if(text.id.on.graphs == "both") { label.coord = cbind(pca.results$x[,1],(pca.results$x[,2] + (0.01*label.offset* abs(max(pca.results$x[,2]) - min(pca.results$x[,2]))))) } else { label.coord = xy.coord } plot.area = define.plot.area(pca.results$x[,1],pca.results$x[,2], xymargins=add.to.margins, v.offset=label.offset) # # ############################################################################## # define the plotting function needed: plot.current.task = function(){ if (pca.visual.flavour == "classic"){ if(text.id.on.graphs == "points" || text.id.on.graphs == "both") { plot(xy.coord, type="p", xlim=plot.area[[1]],ylim=plot.area[[2]], xlab="",ylab=PC2_lab, main = graph.title,sub = paste(PC1_lab,"\n",graph.subtitle), col=colors.of.pca.graph, lwd=plot.line.thickness) } if(text.id.on.graphs == "labels") { plot(xy.coord, type="n", xlim=plot.area[[1]],ylim=plot.area[[2]], xlab="",ylab=PC2_lab, main = graph.title,sub = paste(PC1_lab,"\n",graph.subtitle), col=colors.of.pca.graph, lwd=plot.line.thickness) } abline(h=0, v=0, col = "gray60",lty=2) if(text.id.on.graphs == "labels" || text.id.on.graphs == "both") { text(label.coord, rownames(pca.results$x), col=colors.of.pca.graph) } axis(1,lwd=plot.line.thickness) axis(2,lwd=plot.line.thickness) box(lwd=plot.line.thickness) } else if(pca.visual.flavour == "loadings"){ biplot(pca.results, col=c("grey70", "black"), cex=c(0.7, 1), xlab="", ylab=PC2_lab, main=paste(graph.title, "\n\n", sep=""), sub=paste(PC1_lab,"\n",graph.subtitle, sep=""),var.axes=FALSE) } else if(pca.visual.flavour == "technical"){ layout(matrix(c(1,2), 2, 2, byrow = TRUE), widths=c(3,1)) biplot(pca.results, col=c("black", "grey40"), cex=c(1, 0.9), xlab="", ylab=PC2_lab, main=paste(graph.title, "\n\n", sep=""), sub=paste(PC1_lab,"\n",graph.subtitle, sep=""),var.axes=FALSE) abline(h=0, v=0, col = "gray60",lty=3) # add the subpanel to the right row = mat.or.vec(nc=ncol(pca.results$x),nr=1) for (i in 1:ncol(row)){row[,i]<-"grey45"} # paint the first two PCS black -- i.e. 
the ones actually plotted row[,1]<-"black" row[,2]<-"black" barplot(expl.var, col = row, xlab = "Principal components", ylab = "Proportion of variance explained (in %)") # set a horizontal dashed line, indicating the psychological 5% barrier abline(h=5, lty=3) } else if(pca.visual.flavour == "symbols"){ # determine labels involved labels = c() for (c in rownames(pca.results$x)){ labels = c(labels, gsub("_.*","",c)) } COOR = data.frame(pca.results$x[,1:2], LABEL=labels) labels<-c(levels(COOR$LABEL)) # visualize library(lattice) sps <- trellis.par.get("superpose.symbol") sps$pch <- 1:length(labels) trellis.par.set("superpose.symbol", sps) ltheme <- canonical.theme(color = FALSE) lattice.options(default.theme = ltheme) pl<-xyplot(data=COOR, x=PC2~PC1, xlab=paste(PC1_lab,"\n",graph.subtitle, sep=""), ylab=PC2_lab, groups=LABEL, sub="", key=list(columns=2, text=list(labels), points=Rows(sps, 1:length(labels))), panel=function(x, ...){ panel.xyplot(x, ...) panel.abline(v=0, lty=3) panel.abline(h=0, lty=3) }) plot(pl) } } } # prepares a list of dendrogram-like structures for a bootstrap consensus tree # (the final tree will be generated later, outside the main loop of the script) if (analysis.type == "BCT") { mfw.info = paste(mfw.min,"-",mfw.max.original, sep="") name.of.the.method = "Bootstrap Consensus Tree" short.name.of.the.method = "Consensus" # calculates the dendrogram for current settings current.bootstrap.results = as.phylo(hclust(as.dist(distance.table),"ward")) # current.bootstrap.results = nj(as.dist(distance.table)) # adds the current dendrogram to the list of all dendrograms bootstrap.list[[number.of.current.iteration]] = current.bootstrap.results } # establishing the text to appear on the graph (unless "notitle" was chosen) if(ngram.size > 1) { ngram.value = paste(ngram.size,"-grams", sep="") } else { ngram.value = "" } # if(titles.on.graphs == TRUE) { graph.title = paste(basename(getwd()),"\n",name.of.the.method) if(analysis.type == "BCT") { graph.subtitle = paste(mfw.info," MF",toupper(analyzed.features)," ",ngram.value," Culled @ ",culling.info,"%\n", pronouns.info," ",distance.name.on.graph," Consensus ",consensus.strength," ",start.at.info, sep="") } else { graph.subtitle = paste(mfw.info," MF",toupper(analyzed.features)," ",ngram.value," Culled @ ",culling.info,"%\n", pronouns.info," ",distance.name.on.graph," ",start.at.info, sep="") } } else { graph.title = "" graph.subtitle = "" } # name of the output file (strictly speaking: basename) for graphs graph.filename = paste(basename(getwd()),short.name.of.the.method,mfw.info, "MFWs_Culled",culling.info,pronouns.info, distance.name.on.file,"C",consensus.strength,start.at.info, sep="_") if(analysis.type == "BCT") { graph.filename = paste(basename(getwd()),short.name.of.the.method,mfw.info, "MFWs_Culled",culling.info,pronouns.info, distance.name.on.file,"C",consensus.strength,start.at.info, sep="_") } else { graph.filename = paste(basename(getwd()),short.name.of.the.method,mfw.info, "MFWs_Culled",culling.info,pronouns.info, distance.name.on.file,start.at.info, sep="_") } # ################################################# # plotting # ################################################# # The core code for the graphic output (if bootstrap consensus tree # is specified, the plot will be initiated later) if(analysis.type != "BCT") { if(display.on.screen == TRUE) { plot.current.task() } if(write.pdf.file == TRUE) { pdf(file = paste(graph.filename,"%03d",".pdf",sep=""), width=plot.custom.width,height=plot.custom.height, 
pointsize=plot.font.size) plot.current.task() dev.off() } if(write.jpg.file == TRUE) { jpeg(filename = paste(graph.filename,"%03d",".jpg",sep=""), width=plot.custom.width,height=plot.custom.height, unit="in",res=300,pointsize=plot.font.size) plot.current.task() dev.off() } if(write.emf.file == TRUE) { if(Sys.info()[["sysname"]] == "Windows") { ### Windows win.metafile(filename = paste(graph.filename,"%03d",".emf",sep=""), width=plot.custom.width,height=plot.custom.height, res=300,pointsize=plot.font.size) plot.current.task() dev.off() } else { ### Linux, Mac cat("\n\n\n!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!\n") cat("EMF file format is not supported by", Sys.info()[["sysname"]],"\n") cat("You're suggested to try again with PNG, JPG or PDF.\n") cat("!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!\n") } } if(write.png.file == TRUE) { png(filename = paste(graph.filename,"%03d",".png",sep=""), width=plot.custom.width,height=plot.custom.height, unit="in",res=300,pointsize=plot.font.size) plot.current.task() dev.off() } } ################################################## # writing distance table(s) to a file (if an appropriate option has been chosen) if(save.distance.tables == TRUE) { distance.table.filename = paste("distance_table_",mfw,"mfw_",current.culling,"c.txt",sep="") write.table(file=distance.table.filename, distance.table) } # writing the words (or features) actually used in the analysis if(save.analyzed.features == TRUE) { cat(colnames(table.with.all.freqs[,1:mfw]), file=paste("features_analyzed_",mfw,"mfw_",current.culling,"c.txt",sep=""), sep="\n") } # writing the frequency table that was actually used in the analysis if(save.analyzed.freqs == TRUE) { write.table(table.with.all.freqs[,1:mfw], file=paste("frequencies_analyzed_",mfw,"mfw_",current.culling,"c.txt",sep="")) } } # <-- the internal loop for(i) returns here # ################################################# # blank line on the screen cat("\n") } # <-- the main loop for(j) returns here # ################################################ # bootstrap visualization if(analysis.type == "BCT") { # as above, the task to be plotted is saved as a function if(length(bootstrap.list) <= 2) { cat("\n\nSORRY, BUT YOU ARE EXPECTING TOO MUCH...!\n\n", "There should be at least 3 iterations to make a consensus tree\n\n") } else { plot.current.task = function(){ plot(consensus(bootstrap.list, p=consensus.strength), type="u", font=1, lab4ut="axial", tip.color = colors.of.pca.graph) title (main = graph.title) title (sub = graph.subtitle) } # The core code for the graphic output... Yes, you are right: you’ve seen # the same lines above. Instead of blaming us, write better code yourself # and let us know. 
if(display.on.screen == TRUE) {
plot.current.task()
}
if(write.pdf.file == TRUE) {
pdf(file = paste(graph.filename,"%03d",".pdf",sep=""),
width=plot.custom.width,height=plot.custom.height,
pointsize=plot.font.size)
plot.current.task()
dev.off()
}
if(write.jpg.file == TRUE) {
jpeg(filename = paste(graph.filename,"%03d",".jpg",sep=""),
width=plot.custom.width,height=plot.custom.height,
unit="in",res=300,pointsize=plot.font.size)
plot.current.task()
dev.off()
}
if(write.emf.file == TRUE) {
win.metafile(filename=paste(graph.filename,"%03d",".emf",sep=""),
width=plot.custom.width,height=plot.custom.height,
res=300,pointsize=plot.font.size)
plot.current.task()
dev.off()
}
if(write.png.file == TRUE) {
png(filename = paste(graph.filename,"%03d",".png",sep=""),
width=plot.custom.width,height=plot.custom.height,
unit="in",res=300,pointsize=plot.font.size)
plot.current.task()
dev.off()
}
}}

# #################################################

# final cleaning
cat("\n")
cat("removing most of the variables... \n")
cat("type ls() if you want to see what was not removed\n")
cat("if you are going to change the corpus, clean all: rm(list=ls())\n")
cat("\n")

# a list of variables not to be removed
do.not.remove = c("table.with.all.zscores", "table.with.all.freqs",
"frequencies.0.culling", "distance.table",
variables.not.to.be.removed)
# removing the variables which are not on the above list
list.of.variables = ls()
#rm(list=list.of.variables[!(list.of.variables %in% do.not.remove)])
# #################################################

# TO DO:
# Christof: a loop for different start.at values
# Fotis: custom list of files does not work for sample labels
# add a warning if picture size is going to exceed 25Mpx
# applicable scenarios:
#
#   1. MDS, 100, En, pdf, png
#   2. MDS, 1000, 100% culling, En, pdf, png
#   3. PCA, corr., 100
#   4. Cons.Tree
#   ...
# common wordlist when the number of full-sized novels > 100
# (picking the first 100 by chance? extracting randomly 1M words?,
# extract a number of words, say 50k, from each novel?)
# color dendrograms: cf. the snippet on desktop
# dendrograms: ward, complete, average, nj
# rooted consensus trees?
# the code for MDS and PCA in different flavors deserves some comments!