############################################################################## # stylometric classification test, version 0.4.5 (beta release) # # Copyright (C) 2009-2012 by Maciej Eder & Jan Rybicki. # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation; either version 3 of the License, or # (at your option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # ############################################################################ # # To cite this script in publications you might use: # Eder, M., Rybicki, J. (2011). Stylometry with R. In "Digital Humanities # 2011: Conference Abstracts." Stanford University, Stanford, CA, pp. 308-11. # # Contact with the authors: # Maciej Eder # Jan Rybicki # # ############################################################################ # # # # ver. 0.4.5, 2012/07/12 --> further GUI rearrangements; code cleaned; # bugs fixed # ver. 0.4.4, 2012/05/31 --> the core code rewritten, the I/II set division # abandoned, GUI remodelled, tooltips added, # different input formats supported (xml etc.), # config options loaded from external file; # the code forked into (1) this script, supporting # machine-learning methods (Delta, SVM, NSC, ...), # (2) a script for explanatory analyses # ver. 0.4.3, 2012/04/28 --> feature selection (word and letter n-grams) # ver. 0.4.2, 2012/02/10 --> three ways of splitting words in English; # bugs fixed; GUI code rearranged and simplified # ver. 0.4.1, 2011/06/27 --> better output; better text files uploading, # new options for culling and ranking of candidates # ver. 0.4.0, 2011/06/20 --> the official world-premiere (Stanford, CA) # ver. 0.3.9b, 2011/06/1 --> the code simplified; minor cleaning # ver. 0.3.9, 2011/05/21 --> uploading wordlist from external source; # thousands of improvements; the code simplified # ver. 0.3.8, 2010/11/01 --> skip top frequency words option added # ver. 0.3.7, 2010/11/01 --> better graphs; attempt at better graph layout # ver. 0.3.6, 2010/07/31 --> more graphic options; dozens of improvements # ver. 0.3.5, 2010/07/19 --> module for colour graphs; module for PCA # ver. 0.3.4, 2010/07/12 --> module for uploading corpus files improved # ver. 0.3.3, 2010/06/03 --> the core code simplified and improved (faster!) # ver. 0.3.2, 2010/05/10 --> reordered GUI; minor cleaning # ver. 0.3.1, 2010/05/10 --> the z-scores module improved # ver. 0.3.0, 2009/12/26 --> better counter of "good guesses"; option for # randomly generated samples; minor improvements # ver. 0.2.99, 2009/12/25 --> platform-independent outputfile saving # ver. 0.2.98, 2009/12/24 --> GUI thoroughly integrated with initial variables # ver. 0.2.10, 2009/11/28 --> corrected MFW display in graph, more analysis # description in outputfile # ver. 0.2.9, 2009/11/22 --> auto graphs for MSD and CA # ver. 0.2.8a, 2009/11/21 --> remodelled GUI # ver. 0.2.8, 2009/11/20 --> GUI: radiobuttons, checkbuttons # ver. 0.2.7, 2009/11/19 --> language-determined pronoun selection # ver. 0.2.6, 2009/11/18 --> dialog box (GUI) # ver. 0.2.5, 2009/11/16 --> module for different distance measures; # thousands of improvements (I/O, interface, etc.) # ver. 0.2.2, 2009/10/25 --> numerous little improvements; deleting pronouns # ver. 
# 0.2.1, 2009/08/23 --> module for culling; module for bootstrapping
# ver. 0.2.0, 2009/08/23 --> module for uploading plain text files
# ver. 0.1.9, 2009/07/31 --> numerous improvements, the code simplified
# ver. 0.1.4, 2009/07/19 --> loop for different MFW settings
# ver. 0.0.1, 2009/07/01 --> some bash and awk scripts translated into R
##############################################################################


####### GENERAL SETTINGS (GUI/TEXT-MODE) ###################################

# If you wish to use a simple yet effective graphical interface (GUI),
# just set the following option to TRUE; otherwise, switch this option to FALSE
# and manually edit the rest of the variables (see below).
# If you switch this option on, the values indicated in the following sections
# will serve as defaults for the GUI (you can adapt them to your needs)

interactive.mode.with.GUI = FALSE


####### TEXT- AND LANGUAGE-DEPENDENT SETTINGS ####################

# format of corpus files; available choices are:
# "plain", "xml", "xml.drama", "xml.notitles", "html"

corpus.format = "plain"

# how many MFW should be taken into analysis (if the mfw.min value = mfw.max,
# then no multiple iterations will be computed)
# the start.at option enables skipping top frequency words: you should
# indicate the desired start position of your list (in most cases you will
# probably prefer setting it to 1)

mfw.min = 100
mfw.max = 100
mfw.incr = 100
start.at = 1

# The culling rate specifies the percentage of texts in a corpus in which a given
# word must be found in order to be included in the analysis. Thus, a 100% culling
# rate limits the analysis to words that appear at least once in every text
# in the corpus; at a 50% culling rate, a word is included into the analysis
# when it appears in at least half of the texts in the corpus; a 0% culling
# rate (or no culling) means that no words are omitted.
# about min=max: see above

culling.min = 0
culling.max = 0
culling.incr = 20

# Deleting pronouns (this is independent of the culling procedure).
# If the deleting pronouns option is switched to TRUE, choose one language
# of the following: English, Polish, Latin, French, German, Italian, Hungarian
# (the editable lists of pronouns are available below; see: advanced settings).
# Additionally, there are a few variants of language settings available:
# English.contr, English.all, and Latin.corr. Their meaning is as follows:
# "English.contr": treats contractions as single words, i.e. strings
# such as "don't", "you've" etc. will not be split into two words.
# "English.all": keeps the contractions (as above), and also prevents
# compound words (mother-in-law, double-decker, etc.) from being split.
# "Latin.corr": since some editions do not distinguish the letters v/u,
# this option provides a consistent conversion to "u" in each text.

delete.pronouns = FALSE
corpus.lang = "English.all"

# Selection of features. In classical approaches, frequencies of the most
# frequent words (MFW) are used as the basis for multidimensional analyses.
# It has been argued, however, that other features are also worth considering,
# especially word and/or letter n-grams. The general concept of the n-gram
# is to combine a string of single words/letters into a sequence of n
# elements. Given a sample sentence "This is a simple example", the letter
# 2-grams are as follows: "th", "hi", "is", "s ", " i", "is", "s ", " a", "a ",
# " s", "si", "im", "mp", etc. The same sentence split into word 3-grams:
# "this is a", "is a simple", "a simple example".
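
# A quick illustration of the n-gram idea (kept commented out, so that it does
# not interfere with the analysis): once the helper function make.ngrams(),
# defined further below, has been loaded, word 2-grams can be produced e.g. with:
#
#    example.sentence = c("this", "is", "a", "simple", "example")
#    ngram.size = 2
#    make.ngrams(example.sentence)
#    # [1] "this is"  "is a"  "a simple"  "simple example"
#    ngram.size = 1   # restore the value actually used in the analysis
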
# Another question is whether it really increases the accuracy of attribution;
# further reading: Eder, M. (2011). Style-markers in authorship attribution:
# A cross-language study of the authorial fingerprint, "Studies in Polish
# Linguistics" 6: 101-16.
# Two types of n-grams are available: letters (option "l"), and words ("w").

analyzed.features = "w"
ngram.size = 1


####### MATHEMATICAL SETTINGS (CLASSIFICATION METHOD) #############

# method of classification: choose one of the options described below:
# Delta ("delta"), k-nearest neighbor classification ("knn"),
# Naive Bayes classification ("naivebayes"), Nearest Shrunken Centroids
# ("nsc"), or Support Vector Machines ("svm")

classification.method = "knn"


####### MATHEMATICAL SETTINGS (DISTANCE MEASURE) #################

# Strictly speaking, the choice of an appropriate distance measure
# is the core of the statistical procedure provided by this script.
# (However, the distance measures do not apply to the PCA method.)
# Although this choice is not easy, some of the following measures
# seem to be more suitable for linguistic purposes than others.
# On theoretical grounds, Euclidean Distance and Manhattan
# Distance should be avoided in stylometry. Canberra Distance is quite
# troublesome but effective, e.g. for Latin (it should be combined with
# careful culling settings and a limited number of MFW taken into analysis).
# For English, Classic Delta is usually a good choice. A theoretical
# explanation of the measures implemented in this script is forthcoming (?).
#
# The available distance measures (choose ONE) are as follows:
# "CD" --> Classic Delta as developed by Burrows
# "AL" --> Argamon's Linear Delta (based on Euclidean principles)
# "ED" --> Eder's Delta (explanation and mathematical equation: soon)
# "ES" --> Eder's Simple (explanation and mathematical equation: soon)
# "MH" --> Manhattan Distance (obvious and well documented)
# "CB" --> Canberra Distance (risky, but sometimes amazingly good)
# "EU" --> Euclidean Distance (basic, and the most "natural")

distance.measure = "CD"


######## VISUALIZATION METHODS, LOGS, REPORTS, DISPLAY OPTIONS ############

# Theoretically, one can switch on every option at once (there is not much
# sense in that). This will work, because the script will treat the last
# switched-on option as your actual choice. You may also switch off every
# option (then the logfile will be the only output).

# Cluster Analysis of the Delta distance table;
# this option makes sense if there is only a single iteration (or only a few).
# Multidimensional Scaling of the Delta distance table;
# as above: although possible with multiple iterations, it is unwise.
# Principal Component Analysis: either using the covariance matrix,
# or the correlation matrix -- the latter might be more convincing.
# Bootstrap procedure: multiple iterations will build a consensus tree
# ATTENTION: this option requires the ape library!!!

#cluster.analysis = FALSE
#multidimensional.scaling = FALSE
#pca.covariance.table = FALSE
#pca.correlation.table = FALSE
#make.consensus.tree = FALSE

consensus.strength = 0.5

# Delta is always active: output is directed to a file. You may specify
# the number of final ranking candidates to be displayed (at least 1)

number.of.candidates = 3

# Do you want to display the graph on the screen?
# Do you want to write the graph directly to a graphics file? Which format?

#display.on.screen = TRUE
#write.pdf.file = FALSE
#write.jpg.file = FALSE
#write.emf.file = FALSE # Windows only
#write.png.file = FALSE
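
# (For reference only -- a minimal, generic sketch of how a graph is written to
# a file in base R: open a graphics device, plot, close the device. This is an
# illustration, not the exact code used by this script.)
#
#    pdf("example_graph.pdf")    # or png("example_graph.png"), jpeg(...), etc.
#    plot(1:10, 1:10)
#    dev.off()
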
# Do you want the graphs coloured? Do you want titles on the graphs?

#use.color.graphs = TRUE
#titles.on.graphs = TRUE

# Layout of dendrogram: horizontal/vertical (Cluster Analysis only)

#dendrogram.layout.horizontal = TRUE

# Report the number of correct guesses for each iteration (written to
# the log file). Ranking of the least unlikely candidates in the log file.

how.many.correct.attributions = TRUE
final.ranking.of.candidates = TRUE


####### ADVANCED SETTINGS (FOR EXPERTS ONLY) ########################

# Normally, the script computes a big table of thousands
# of word frequencies. This is a non-trivial and time-consuming task.
# If done once, there is no need to waste time and do it again, because
# the tables are also written into output files. To retrieve all the word
# frequencies from existing files, switch this option to TRUE.
# BUT it MUST be set to FALSE when you switch corpora in the same R session!

use.existing.freq.tables = TRUE

# Some people like to see what's going on, and to be able to revise/edit
# the list of words for analysis. To meet their wishes, the script
# saves the list into a separate output file. You can either delete as many
# words as you want from this file, or mark the unwanted words with "#"
# (just like these comments are marked). Switching the following option on
# prevents the script from overwriting the file, and ensures that the wordlist
# is loaded from there.

use.existing.wordlist = TRUE

# Usually, it is recommended to cut off the tail of the word-list;
# if you do not want to cut the list, then the variable may be set to an
# absurdly big number, or to "mfw.list.cutoff = mfw.list.of.all"
# (and then you are advised to use a fast computer)

mfw.list.cutoff = 5000

# How the z-scores should be calculated:
# if the variable is set to FALSE, then the z-scores rely
# on the primary set only (this should be better in most cases; after all,
# this is the classical solution used by Burrows and Hoover).
# Otherwise, the scaling is based on all the values
# in the primary and the secondary sets.

z.scores.of.all.samples = FALSE

# Both tables of frequencies are built using the pre-prepared word
# list of the whole I set. Alternatively, one might want to prepare
# this list from both sets. Similarly, culling can be calculated either
# on the I set, or on both sets.

reference.wordlist.of.all.samples = FALSE
culling.of.all.samples = TRUE
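
# (A toy illustration of the culling idea described above, commented out and
# using hypothetical values: at a 50% culling rate a word is kept only if it
# occurs in at least half of the texts.)
#
#    word.occurs.in = c(TRUE, TRUE, FALSE, TRUE)           # found in 3 of 4 texts
#    sum(word.occurs.in) / length(word.occurs.in) >= 0.5   # TRUE: the word is kept
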
# file with the final ranking of Delta results (log file)
outputfile = "final_results.txt"

# pronouns (and other words) to be deleted

pol.pronouns = c("ci", "ciebie", "cię", "go", "ich", "im", "ja", "ją", "je",
    "jego", "jej", "jemu", "ma", "mą", "me", "mego", "mej", "memu", "mi",
    "mną", "mnie", "moi", "moich", "moim", "moimi", "moja", "moją", "moje",
    "mojego", "mojej", "mojemu", "mój", "mu", "my", "mych", "mym", "mymi",
    "nam", "nami", "nas", "nią", "nich", "nie", "niego", "niej", "niemu",
    "nim", "nimi", "on", "ona", "one", "oni", "ono", "swa", "swą", "swe",
    "swego", "swej", "swemu", "swoi", "swoich", "swoim", "swoimi", "swoja",
    "swoją", "swoje", "swojego", "swojej", "swojemu", "swój", "swych", "swym",
    "swymi", "tobą", "tobie", "twa", "twą", "twe", "twego", "twej", "twemu",
    "twoi", "twoich", "twoim", "twoimi", "twoja", "twoją", "twoje", "twojego",
    "twojej", "twojemu", "twój", "twych", "twym", "twymi", "ty", "wam",
    "wami", "was", "wy", "wasz", "wasza", "wasze", "waszym", "waszymi",
    "waszych", "waszego", "waszej", "waszą")

eng.pronouns = c("he", "her", "hers", "herself", "him", "himself", "his", "i",
    "me", "mine", "my", "myself", "our", "ours", "ourselves", "she", "thee",
    "their", "them", "themselves", "they", "thou", "thy", "thyself", "us",
    "we", "ye", "you", "your", "yours", "yourself")

lat.pronouns = c("ea", "eae", "eam", "earum", "eas", "ego", "ei", "eis",
    "eius", "eo", "eorum", "eos", "eum", "id", "illa", "illae", "illam",
    "illarum", "illas", "ille", "illi", "illis", "illius", "illo", "illorum",
    "illos", "illud", "illum", "is", "me", "mea", "meae", "meam", "mearum",
    "meas", "mei", "meis", "meo", "meos", "meorum", "meum", "meus", "mihi",
    "nobis", "nos", "noster", "nostra", "nostrae", "nostram", "nostrarum",
    "nostras", "nostri", "nostris", "nostro", "nostros", "nostrorum",
    "nostrum", "sua", "suae", "suam", "suarum", "suas", "sui", "suis", "suo",
    "suos", "suorum", "suum", "suus", "te", "tibi", "tu", "tua", "tuae",
    "tuam", "tuarum", "tuas", "tui", "tuis", "tuo", "tuos", "tuorum", "tuum",
    "tuus", "vester", "vestra", "vestrae", "vestram", "vestrarum", "vestras",
    "vestri", "vestris", "vestro", "vestros", "vestrorum", "vestrum",
    "vobis", "vos")

fra.pronouns = c("je", "me", "moi", "tu", "te", "toi", "il", "elle", "le",
    "la", "lui", "se", "lui", "elle", "soi", "nous", "vous", "ils", "elles",
    "les", "leur", "se", "eux", "elles", "soi")

ger.pronouns = c("ich", "mich", "mir", "mein", "meine", "meiner", "meines",
    "du", "dich", "dir", "dein", "deine", "deiner", "deines", "er", "sich",
    "ihr", "ihrer", "ihn", "ihnen", "sein", "seiner", "seines", "seine",
    "sie", "wir", "uns", "unser", "unsere", "euch", "eure", "euer")

ita.pronouns = c("ci", "gli", "io", "la", "le", "lei", "li", "loro", "lo",
    "lui", "me", "mi", "noi", "si", "te", "ti", "tu", "vi", "voi", "egli",
    "ella", "esso", "essa", "essi", "esse", "mio", "mia", "miei", "mie",
    "tuo", "tua", "tuoi", "tue", "suo", "sua", "suoi", "sue", "nostro",
    "nostra", "nostri", "nostre", "vostro", "vostra", "vostri", "vostre",
    "loro", "loro", "loro", "loro")

hun.pronouns = c("annak", "az", "azzal", "bele", "belé", "beléd", "beléje",
    "beléjük", "belém", "belénk", "belétek", "belöle", "belőled", "belőlem",
    "belőletek", "belőlük", "belőlünk", "benne", "benned", "bennem",
    "bennetek", "bennük", "bennünk", "én", "ennek", "enyéim", "enyém",
    "enyémek", "érte", "érted", "értem", "értetek", "értük", "értünk",
    "ez", "ezzel",
"hozzá", "hozzád", "hozzája", "hozzájuk", "hozzám", "hozzánk", "hozzátok", "maga", "magáé", "magáéi", "maguk", "maguké", "magukéi", "mi", "mieink", "mienk", "miénk", "nála", "nálad", "nálam", "nálatok", "náluk", "nálunk", "neked", "nekem", "neki", "nekik", "nektek", "nekünk", "ő", "ők", "ön", "öné", "önéi", "önnek", "önnel", "önök", "önöké", "önökéi", "önökkel", "önöknek", "övé", "övéi", "övéik", "övék", "rád", "rája", "rajta", "rajtad", "rajtam", "rajtatok", "rajtuk", "rajtunk", "rájuk", "rám", "ránk", "rátok", "róla", "rólad", "rólam", "rólatok", "róluk", "rólunk", "te", "ti", "tied", "tiéd", "tieid", "tieitek ", "tietek", "tiétek", "tőle", "tőled", "tőlem", "töletek", "tőlük", "tőlünk", "vele", "veled", "velem", "veletek", "velük", "velünk") # when analyzed texts are significantly unequal in length, it is not a bad # idea to prepare samples as randomly chosen "bags of words". If this option # is switched on, the desired size of a sample should be indicated. # Sampling with and without replacement is also available. # (Further reading: Eder, M. (2010). Does Size Matter? Authorship Attribution, # Short Samples, Big Problem. In "Digital Humanities 2010: Conference # Abstracts." King's College London 2010, pp. 132-35.) # # ATTENTION: this makes sense only if "use.existing.freq.tables" is set "FALSE" random.sampling = FALSE length.of.random.sample = 10000 sampling.with.replacement = FALSE # the variables are now ready to use (unless the GUI option was chosen) # ################################################################### # ################################################# # checking some of the initial variables -- just in case # ################################################# # Given a language option ("English", "Polish", "Latin" etc., as described # above), this procedure selects one of the lists of pronouns # If no language was chosen (or if a desired language is not supported, or if # there was a spelling mistake), then the variable will be set to "English". # If "Pronouns deleted" is set to FALSE, this is immaterial. if(exists("pronouns") == FALSE) # checking if the "pronouns" box is empty pronouns = eng.pronouns # This prevents us from choosing a non-existing distance measure -- in such # case the default distance (Classic Delta) will be switched on. Be aware # of correct spelling: then the default value will be assigned as well! if(distance.measure %in% c("CD","AL","ED","ES","MH","CB","EU") == FALSE) { distance.measure = "CD" } classification.method = tolower(classification.method) # ################################################# if(file.exists("config.txt") == TRUE) { source("config.txt") } # ############################################################################# # Final settings (you are advised rather not to change them) # ############################################################################# # A chosen language option should be followed by an assignment of # the appropriate set of pronouns. 
# The following code is responsible for it:

if(corpus.lang == "English") pronouns = eng.pronouns
if(corpus.lang == "Polish") pronouns = pol.pronouns
if(corpus.lang == "Latin") pronouns = lat.pronouns
if(corpus.lang == "French") pronouns = fra.pronouns
if(corpus.lang == "German" ) pronouns = ger.pronouns
if(corpus.lang == "Italian") pronouns = ita.pronouns
if(corpus.lang == "Hungarian") pronouns = hun.pronouns

# Since it is not so easy to perform, say, 17.9 iterations, or to analyze
# 543.3 words, the code below rounds all numerical variables, to prevent
# you from your stupid jokes with funny settings. (OK, it is still
# possible to crash the script but we will not give you a hint)

mfw.min = round(mfw.min)
mfw.max = round(mfw.max)
mfw.incr = round(mfw.incr)
start.at = round(start.at)
culling.min = round(culling.min)
culling.max = round(culling.max)
culling.incr = round(culling.incr)
mfw.list.cutoff = round(mfw.list.cutoff)

# This also prevents unexpected settings
if(number.of.candidates < 1) {
  number.of.candidates = 1
  }
number.of.candidates = round(number.of.candidates)

# Finally, we want to save some variable values for later use
cat("",file="config.txt",append=F)

var.name <- function(x) {
  if(is.character(x) == TRUE) {
    cat(paste(deparse(substitute(x)),"=\"",x,"\"", sep=""),
        file="config.txt",sep="\n",append=T)
  } else {
    cat(paste(deparse(substitute(x)),x, sep="="),
        file="config.txt",sep="\n",append=T)
  }
}

var.name(corpus.format)
var.name(corpus.lang)
var.name(analyzed.features)
var.name(ngram.size)
var.name(random.sampling)
var.name(length.of.random.sample)
var.name(classification.method)
var.name(mfw.min)
var.name(mfw.max)
var.name(mfw.incr)
var.name(start.at)
var.name(mfw.list.cutoff)
var.name(culling.min)
var.name(culling.max)
var.name(culling.incr)
var.name(delete.pronouns)
var.name(culling.of.all.samples)
var.name(final.ranking.of.candidates)
var.name(how.many.correct.attributions)
var.name(use.existing.freq.tables)
var.name(use.existing.wordlist)
var.name(distance.measure)
var.name(number.of.candidates)
var.name(z.scores.of.all.samples)
var.name(reference.wordlist.of.all.samples)

# #############################################################################
# #############################################################################


# #################################################
# FUNCTIONS:
# #################################################

# Function for combining single features (words
# or letters) into n-grams, or strings of n elements;
# e.g. letter 2-grams of the sentence "This is a sentence"
# are as follows: "th", "hi", "is", "s ", " i", "is", etc.
# Required argument: name of the vector of words/letters
# #################################################

make.ngrams = function(input.text) {
  txt = c()
  if(ngram.size > 1) {
    txt = input.text
    for(n in 2:ngram.size) {
      txt = paste(txt[1:(length(txt)-1)],input.text[n:length(input.text)])
    }
  } else {
  # if the n-gram size is set to 1, then nothing will happen
    txt = input.text
  }
return(txt)
}

# #################################################
# Generic function for splitting a given input text into
# single words (chains of characters delimited with
# spaces or punctuation marks). Alternatively,
# you can write here another rule for splitting.
# Required argument: name of the text to be split.
# ATTENTION: this is the only piece of coding in this script
# that depends on the operating system used
# #################################################

split.into.words = function(input.text) {
  # splitting into units specified by a regular expression; here,
  # all sequences between non-letter characters are assumed to be words:
  ### Linux, Mac
  #tokenized.txt = c(unlist(strsplit(input.text, "[^[:alpha:]]+")))
  ### Windows
  tokenized.txt = c(unlist(strsplit(input.text, "\\W+",perl=T)))
return(tokenized.txt)
}

# #################################################
# Function for splitting a given input text into
# single words (chains of characters delimited with
# spaces or punctuation marks). There is also an option
# of splitting the text into letters and/or performing
# splitting into n-grams
# #################################################

split.sample = function(input.text) {
  # loading the file; optionally, fiddling with dashes and contractions:
  #
  # this is the standard procedure of splitting input texts
  if(corpus.lang != "English.contr" && corpus.lang != "English.all") {
    tokenized.sample = split.into.words(input.text)
  }
  # if the Latin option with adjusting the v/u letters was switched on,
  # smashing the distinction and converting both types to the letter u
  if(corpus.lang == "Latin.corr") {
    tokenized.sample = gsub("v","u",tokenized.sample)
  }
  # this code is used for English corpora only
  if(corpus.lang == "English.contr" || corpus.lang == "English.all") {
    # replacing non-ASCII apostrophes with the simple ' (standard ASCII char)
    tokenized.sample = gsub("’","'",input.text)
    # getting rid of contractions ('t, 's, 've, 'd, 'll, 'em, 'im) by replacing
    # their apostrophes with ^ (other apostrophes will not be replaced)
    tokenized.sample = gsub("([[:alpha:]])'([tsdm]|ll|ve|em|im)\\b","\\1^\\2",
                       tokenized.sample)
    # adding spaces around dashes (to distinguish dashes and hyphens)
    tokenized.sample = gsub("[-]{2,5}"," -- ",tokenized.sample)
    # depending on which option was switched on, either the contractions are
    # kept, or all the peculiarities, i.e. both contractions and hyphens
    if(corpus.lang == "English.contr") {
      tokenized.sample = c(unlist(strsplit(tokenized.sample,"[^[:alpha:]^]+")))
    }
    if(corpus.lang == "English.all") {
      tokenized.sample = c(unlist(strsplit(tokenized.sample,"[^[:alpha:]^-]+")))
      # trying to clean the remaining dashes:
      tokenized.sample = gsub("^[-]+$","",tokenized.sample)
    }
  }
  # trying to avoid empty strings:
  tokenized.sample = tokenized.sample[nchar(tokenized.sample)>0]
  # trying to get rid of non-letter characters:
  tokenized.sample = tokenized.sample[grep("[^[:digit:]]",tokenized.sample)]
  #
  # splitting the sample into letters (if analyzed.features was set to "l")
  if(analyzed.features == "l") {
    tokenized.sample = paste(tokenized.sample, collapse=" ")
    tokenized.sample = unlist(strsplit(tokenized.sample,""))
  }
  #
  # making n-grams (if the value "n" has been set to 2 or more)
  if(ngram.size > 1) {
    tokenized.sample = make.ngrams(tokenized.sample)
  }
# the result of the function:
return(tokenized.sample)
}
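
# A toy example of the two functions above (commented out; the output shown
# assumes the default settings from the preamble, i.e. the Windows regex and
# corpus.lang = "English.all"):
#
#    split.into.words("don't stop")
#    # [1] "don"  "t"  "stop"
#    split.sample("don't stop, mother-in-law")
#    # [1] "don^t"  "stop"  "mother-in-law"
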
# #################################################
# Function for adjusting different input formats:
# xml (TEI) in two variants, html, and plain text files.
# Required argument: name of the text to pre-process
# #################################################

delete.markup = function(input.text) {
  if(corpus.format == "xml" || corpus.format == "xml.drama") {
    # getting rid of the TEI header (if it exists); the tag patterns below are
    # typical TEI/html markup -- adjust them if your corpus is encoded differently
    if(length(grep("</teiheader>",input.text)) > 0) {
      input.text = input.text[-c(1:(grep("</teiheader>",input.text)))]
    }
    # the whole text into one (very) long line
    preprocessed.text = paste(input.text, collapse=" ")
    # getting rid of dramatis personae (speaker names)
    if(corpus.format == "xml.drama"){
      preprocessed.text = gsub("<speaker>.*?</speaker>","",preprocessed.text)
    }
    # getting rid of comments and (editorial) notes
    preprocessed.text = gsub("<note.*?</note>","",preprocessed.text)
    # getting rid of all the remaining tags
    preprocessed.text = gsub("<.*?>","",preprocessed.text)
  } else if(corpus.format == "html") {
    # getting rid of the html header (if it exists)
    if(length(grep("<body",input.text)) > 0) {
      input.text = input.text[-c(1:(grep("<body",input.text)))]
    }
    # the whole text into one (very) long line
    preprocessed.text = paste(input.text, collapse=" ")
    # getting rid of links (together with the text they contain)
    preprocessed.text = gsub("<a href.*?</a>","",preprocessed.text)
    # getting rid of all the remaining tags
    preprocessed.text = gsub("<.*?>","",preprocessed.text)
  } else {
    preprocessed.text = input.text
  }
return(preprocessed.text)
}

# #################################################
# Function for preparing and printing
# the final ranking of the least unlikely authors
# for each text in a given table of distances.
# Arguments: 1. table with distances, 2. number of candidates to be displayed
# #################################################

make.ranking.of.candidates = function(dist.matrix,candidates) {
  for(h in 1:length(dist.matrix[,1])) {
    ranked.candidates = order(dist.matrix[h,])
    current.sample = c(gsub("_.*","",colnames(dist.matrix)))[ranked.candidates]
    if((gsub("_.*","",rownames(dist.matrix)))[h] != current.sample[1]) {
      cat(c(c(rownames(dist.matrix))[h]," ", "-->", " ",
          current.sample[1:candidates], " (delta score:",
          round(dist.matrix[h,ranked.candidates[1:candidates]],4), ")", "\n"),
          file=outputfile,append=T)
    }
  }
}

# #################################################
# Function for preparing and printing
# the number of correct attributions;
# this is a variant of the above function make.ranking.of.candidates
# Argument: table of distances
# #################################################

make.number.of.correct.attributions = function(dist.matrix) {
  corr.attrib = 0
  for(h in 1:length(dist.matrix[,1])) {
    ranked.c = order(dist.matrix[h,])
    current.sample = c(gsub("_.*","",colnames(dist.matrix)))[ranked.c]
    if((gsub("_.*","",rownames(dist.matrix)))[h] == current.sample[1]) {
      corr.attrib = corr.attrib + 1
    }
  }
# the result of the function:
return(corr.attrib)
}

# #############################################################################

# #################################################
# the module for loading a corpus from text files;
# it can be omitted if the frequency tables for
# both the primary and the secondary set already exist
# (then "use.existing.freq.tables" should be set
# to TRUE in the preamble of the script/GUI)
# #################################################
#
#
# Checking: (1) whether to produce new frequency tables or use existing ones;
# (2) whether the tables are stored in memory or written into files.
# If you have chosen to use the existing tables and there are no such tables
# available, then your choice will be ignored

if(use.existing.freq.tables == TRUE
          && file.exists("freq_table_primary_set.txt") == TRUE
          && file.exists("freq_table_secondary_set.txt") == TRUE ) {
  if(exists("freq.I.set.0.culling") && exists("freq.II.set.0.culling")) {
    cat("\n", "using frequency tables stored as variables...", "\n")
  } else {
    cat("\n", "reading files with frequency tables...", "\n")
    freq.I.set.0.culling = t(read.table("freq_table_primary_set.txt"))
    freq.II.set.0.culling = t(read.table("freq_table_secondary_set.txt"))
    cat("\n", "frequency tables loaded successfully", "\n\n")
  }
  # extracting names of the samples
  filenames.primary.set = rownames(freq.I.set.0.culling)
  filenames.secondary.set = rownames(freq.II.set.0.culling)
  #
  # checking whether an existing wordlist should be used as well
  if (use.existing.wordlist == TRUE && file.exists("wordlist.txt") == TRUE){
    cat("\n", "reading a wordlist from file...", "\n")
    mfw.list.of.all = scan("wordlist.txt",what="char",sep="\n")
    mfw.list.of.all = c(grep("^[^#]",mfw.list.of.all,value=TRUE))
    #
    # adjusting the size of the frequency tables to the existing wordlist
    freq.I.set.0.culling =
        freq.I.set.0.culling[,colnames(freq.I.set.0.culling) %in% mfw.list.of.all]
    freq.II.set.0.culling =
        freq.II.set.0.culling[,colnames(freq.II.set.0.culling) %in% mfw.list.of.all]
  } else {
    # the wordlist will be created from the existing tables of frequencies
    mfw.list.of.all = colnames(freq.I.set.0.culling)
    # some comments into the file containing the wordlist
    cat("# This file contains the words that were used in the table",
        "# of frequencies uploaded from an external file. The current list",
        "# can be used for the next tasks, and for this purpose it can be",
        "# manually revised, edited, deleted, culled, etc.",
        "# You can either delete unwanted words, or mark them with \"#\"",
        "# -----------------------------------------------------------------",
        "", file="wordlist.txt", sep="\n")
    # the current wordlist into the file (appended after the comments above)
    cat(mfw.list.of.all, file="wordlist.txt", sep="\n",append=T)
  }

# if the existing tables will not be used, then begin producing new tables
} else {

  #
  # Retrieving the names of samples
  #
  filenames.primary.set = list.files("primary_set")
  filenames.secondary.set = list.files("secondary_set")
  #
  # Checking whether the required files and subdirectories exist
  if(file.exists("primary_set")==FALSE || file.exists("secondary_set")==FALSE) {
    cat("\n\n !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!\n",
    "Working directory should contain two subdirectories: \"primary_set\" and \"secondary_set\"\n",
    "!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!\n\n")
    stop("corpus prepared incorrectly")
  }
  if(length(filenames.primary.set) < 2 || length(filenames.secondary.set) < 2) {
    cat("\n\n !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!\n",
    "Both subdirectories \"primary_set\" and \"secondary_set\" should contain at least two text samples!\n",
    "!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!\n\n")
    stop("corpus prepared incorrectly")
  }
  #
  #
  # loading the primary set from text files
  corpus.of.primary.set = list()
  setwd("primary_set")
  for (file in filenames.primary.set) {
    # loading the next file from the list filenames.primary.set,
    current.file = tolower(scan(file,what="char",sep="\n", quiet=T))
    # delete xml/html markup (if applicable)
    current.file = delete.markup(current.file)
    # deleting punctuation, splitting into words:
    split.file = split.sample(current.file)
    # if the current text is too short, abort the script
    if(length(split.file) < 10) {
      cat("\n\n",file, "\t", "this sample is VERY short!", "\n\n")
      setwd(".."); stop("corpus error")
    }
    # appending the current text to the virtual corpus
    corpus.of.primary.set[[file]] = split.file
    cat(file, "\t", "loaded successfully", "\n")
  }
  setwd("..")
  #
  # loading the secondary set from text files
  corpus.of.secondary.set = list()
  setwd("secondary_set")
  for (file in filenames.secondary.set) {
    # loading the next file from the list filenames.secondary.set,
    current.file = tolower(scan(file,what="char",sep="\n", quiet=T))
    # delete xml/html markup (if applicable)
    current.file = delete.markup(current.file)
    # deleting punctuation, splitting into words:
    split.file = split.sample(current.file)
    # if the current text is too short, abort the script
    if(length(split.file) < 10) {
      cat("\n\n",file, "\t", "this sample is VERY short!", "\n\n")
      setwd(".."); stop("corpus error")
    }
    # appending the current text to the virtual corpus
    corpus.of.secondary.set[[file]] = split.file
    cat(file, "\t", "loaded successfully", "\n")
  }
  setwd("..")
  # blank line on the screen
  cat("\n")
  #
  #
  # both directories (primary_set and secondary_set) should contain some texts;
  # if the number of text samples is lower than 2, the script will stop
  if(length(corpus.of.primary.set) < 2 || length(corpus.of.secondary.set) < 2) {
    cat("\n\n","either primary_set or secondary_set is empty!", "\n\n")
    stop("corpus error")
  }
  #
  #
  # We need a list of the most frequent words used in the current corpus,
  # in descending order, without frequencies (just a list of words). It can
  # either be loaded from a file (then set the option "use.existing.wordlist=TRUE"),
  # or created by the code provided below:
  #
  if (use.existing.wordlist == TRUE && file.exists("wordlist.txt") == TRUE) {
    cat("\n", "reading a wordlist from file...", "\n")
    mfw.list.of.all = scan("wordlist.txt",what="char",sep="\n")
    mfw.list.of.all = c(grep("^[^#]",mfw.list.of.all,value=TRUE))
  } else {
    # Extracting all the words used in the texts of the primary set
    # (or of both sets, if "reference.wordlist.of.all.samples" is set to TRUE)
    #
    wordlist.of.primary.set = c()
    for (file in 1 : length(corpus.of.primary.set)) {
      # loading the next sample from the list filenames.primary.set,
      current.text = corpus.of.primary.set[[file]]
      # putting the samples together:
      wordlist.of.primary.set = c(wordlist.of.primary.set, current.text)
      cat(names(corpus.of.primary.set[file]),"\t","tokenized successfully", "\n")
    }
    # including words of the secondary set in the reference wordlist (if specified)
    if (reference.wordlist.of.all.samples == TRUE) {
      wordlist.of.secondary.set = c()
      for (file in 1 : length(corpus.of.secondary.set)) {
        # loading the next sample from the list filenames.secondary.set,
        current.text = corpus.of.secondary.set[[file]]
        # putting the samples together:
        wordlist.of.secondary.set = c(wordlist.of.secondary.set, current.text)
        cat(names(corpus.of.secondary.set[file]),"\t","tokenized successfully","\n")
      }
    } else {
      wordlist.of.secondary.set = c()
    }
    #
    #
    # preparing a sorted frequency list of the whole primary set
    mfw.list.of.all = sort(table(
            c(wordlist.of.primary.set,wordlist.of.secondary.set)), decreasing=T)
    # if the whole list is long, then cut off the tail (e.g., > 5000 mfw)
    if (length(mfw.list.of.all) > mfw.list.cutoff) {
      mfw.list.of.all = mfw.list.of.all[1:mfw.list.cutoff]
    }
    # the only thing we need is the words ordered by frequency (no frequencies)
    mfw.list.of.all = names(mfw.list.of.all)
    #
    # some comments into the file containing the wordlist
"# of frequencies. It can be also used for the next tasks, and for this", "# purpose it can be manually revised, edited, deleted, culled, etc.", "# You can either delete unwanted words, or mark them with \"#\"", "# -----------------------------------------------------------------------", "", file="wordlist.txt", sep="\n") # the current wordlist into a file cat(mfw.list.of.all, file="wordlist.txt", sep="\n",append=F) # } # <----- conditional expr. "use.existing.wordlist" terminates here # # # # # blank line on the screen cat("\n") # # # ################################################# # FUNCTION: make.paralel.frequency.lists() # preparing a huge table with all the frequencies (> mwf.list.cutoff). # Two arguments are required -- a vector with filenames # and a specified variable where the corpus is stored (in a list) # ################################################# # make.paralel.frequency.lists = function(filenames,current.corpus) { freq.list.of.all.the.samples = c() freq.list.of.current.sample = c() for (file in filenames) { # loading the next sample from the list filenames.primary.set, current.sample = current.corpus[[file]] # # if random sampling was chosen, the text will be randomized and # a sample of a given lenght will be excerpted if(random.sampling == TRUE) { current.sample = head(sample(current.sample, replace = sampling.with.replacement), length.of.random.sample) } # # # preparing the frequency list of the current sample raw.freq = table(current.sample) * 100 / length(current.sample) # adjusting the frequency list to the main MFW list obtained above freq.list.of.current.sample = raw.freq[mfw.list.of.all] # taking the names (sc. words) from the main MFW list names(freq.list.of.current.sample) = mfw.list.of.all # and sticking the current sample into the general frequency table freq.list.of.all.the.samples = rbind(freq.list.of.all.the.samples, freq.list.of.current.sample) # a short message on the screen: cat(file, "\t", "excerpted successfully", "\n") } # adjusting names of the rows (=samples) rownames(freq.list.of.all.the.samples) = c(filenames) # the result of the function return(freq.list.of.all.the.samples) } # # # preparing a huge table of all the frequencies for the primary set freq.I.set.0.culling = make.paralel.frequency.lists(filenames.primary.set,corpus.of.primary.set) # all NA values will be adjusted to 0 freq.I.set.0.culling[which(is.na(freq.I.set.0.culling))] = 0 # # # preparing a huge table of all the frequencies for the secondary set freq.II.set.0.culling = make.paralel.frequency.lists(filenames.secondary.set,corpus.of.secondary.set) # all NA values will be adjusted to 0 freq.II.set.0.culling[which(is.na(freq.II.set.0.culling))] = 0 # # # writing the frequency tables to text files (they can be re-used!) write.table(t(freq.I.set.0.culling), file="freq_table_primary_set.txt", sep="\t", row.names=TRUE, col.names=TRUE) write.table(t(freq.II.set.0.culling), file="freq_table_secondary_set.txt", sep="\t", row.names=TRUE, col.names=TRUE) # } # <----- conditional expr. 
"use.existing.freq.tables" terminates here # # # ################################################# # the module for loading the corpus terminates here # ################################################# # ################################################# # MAIN PROGRAM; the main loop is below # ################################################# # cleaning the outputfile cat("",file=outputfile,append=F) # saving the original mfw.max value in mfw.max.original # this is useful for subtitles of bootstrap graphs mfw.max.original = mfw.max # the general counter for different purposes: initiation number.of.current.iteration = 0 # useful for diagnostic reasons; this will be reported in the logfile total.no.of.correct.attrib = c() total.no.of.possible.attrib = c() # retrieving the total number of texts to be "guessed" # (anonymous texts and unique authorial samples will not be counted) authors.I.set = c(gsub("_.*","",rownames(freq.I.set.0.culling))) authors.II.set = c(gsub("_.*","",rownames(freq.II.set.0.culling))) perfect.guessing = length(authors.II.set[authors.II.set %in% authors.I.set]) # load the ape library; make an empty bootstrap.results list # this will be executed only if the bootstrap option was checked #if (make.consensus.tree == TRUE) { # library(ape) # bootstrap.list = list() # } # ################################################# # module for culling (THE MAIN LOOP IN THE PROGRAM) # ################################################# # ################################################# # ################################################# # module for culling (THE MAIN LOOP IN THE PROGRAM) # ################################################# # testing if desired culling settings are acceptable; # if too large, it is set to maximum possible if(culling.max > 100) { culling.max = 100 } # if too small, it is set to 0 (i.e. minimal value) if(culling.min < 0) { culling.min = 0 } # avoiding infinite loops if(culling.incr <= 1) { culling.incr = 10 } # ################################################# for(j in (culling.min/culling.incr):(culling.max/culling.incr)) { current.culling = j * culling.incr # the beginning of the culling procedure (this is to be done # on the primary set only; the secondary set is using the same list!) 
#
raw.list.after.culling = c()

# extracting non-zero values from the primary set frequency table,
# or from both frequency tables (if specified)
if(culling.of.all.samples == FALSE) {
  nonzero.values = freq.I.set.0.culling > 0
} else {
  nonzero.values = rbind(freq.I.set.0.culling,freq.II.set.0.culling) > 0
}

# counting how many non-zero values there are for each word
for (y in 1: length(nonzero.values[1,])) {
  raw.list.after.culling = c(raw.list.after.culling,
        (length(grep("TRUE",nonzero.values[,y])) / length(nonzero.values[,y]))
        >= current.culling/100
        )
}
# a raw culling list has no word identification; let's change it:
names(raw.list.after.culling) = colnames(freq.I.set.0.culling)
# a simple sequence of the words which were not culled
list.of.words.after.culling =
        c(names(raw.list.after.culling[grep("TRUE",raw.list.after.culling)]))

# procedure for deleting pronouns
if (delete.pronouns == TRUE) {
  list.of.words.after.culling =
    list.of.words.after.culling[!(list.of.words.after.culling %in% pronouns)]
}

# the above list-of-not-culled to be applied to both sets:
primary.set = freq.I.set.0.culling[,c(list.of.words.after.culling)]
rownames(primary.set) = filenames.primary.set
secondary.set = freq.II.set.0.culling[,c(list.of.words.after.culling)]
rownames(secondary.set) = filenames.secondary.set

# #################################################
# culling is done, but we are still inside the main loop

# starting the frequency list at the frequency rank set in the start.at option above
primary.set = primary.set[,start.at:length(primary.set[1,])]
secondary.set = secondary.set[,start.at:length(secondary.set[1,])]

# Testing if the desired MFW number is acceptable:
# if MFW is too large, it is set to the maximum possible.
if(mfw.max > length(primary.set[1,])) {
  mfw.max = length(primary.set[1,])
  }
# if too small, it is set to 1 (i.e., the minimal value)
if(mfw.min < 1) {
  mfw.min = 1
  }
# if culling is too strong, sometimes strange things may happen; let's block it
if(mfw.min > mfw.max) {
  mfw.min = mfw.max
  }

# MFW set to mfw.max for a while (it will change later on)
mfw = mfw.max

cat("\n")
cat("culling @ ", current.culling,"\t","available words ",mfw.max,"\n")


# #################################################
# z-scores calculations
# #################################################

# mean and standard dev. for each word (in the primary set)
primary.set.mean = c(sapply(as.data.frame(primary.set), mean))
primary.set.sd = c(sapply(as.data.frame(primary.set), sd))

# calculating z-scores for both the I and II sets (a message on the screen)
cat("Calculating z-scores... \n\n")
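
# (For reference: when z.scores.of.all.samples is FALSE, each word frequency f
# is rescaled as z = (f - mean) / sd, with the mean and standard deviation
# taken over the samples of the primary set. A toy example, commented out:)
#
#    toy.freqs = c(2, 4, 6)
#    (toy.freqs - mean(toy.freqs)) / sd(toy.freqs)    # -1  0  1
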
\n\n") # an additional table composed of relative word frequencies # of joint primary and secondary sets freq.table.both.sets = rbind(primary.set[,1:mfw.max], secondary.set[,1:mfw.max]) # Entropy distance: experimental, but entirely available # (the results do not really differ than for typical word frequencies) # #A = t(t(freq.table.both.sets + 1) / colSums(freq.table.both.sets + 1)) #B =t(t(log(freq.table.both.sets + 2)) / -(colSums(A * log(A)))) #freq.table.both.sets = B # # calculating z-scores either of primary set, or of both sets if(z.scores.of.all.samples == FALSE) { # function for z-scores scaling executed for primary.set zscores.primary.set = scale(primary.set) rownames(zscores.primary.set) = rownames(primary.set) # function for z-scores scaling executed for secondary.set zscores.secondary.set = scale(secondary.set, center=primary.set.mean, scale=primary.set.sd) rownames(zscores.secondary.set) = rownames(secondary.set) # the two tables with calculated z-scores should be put together zscores.table.both.sets = rbind(zscores.primary.set, zscores.secondary.set) } else { # the z-scores can be calculated on both sets as alternatively zscores.table.both.sets = scale(freq.table.both.sets) zscores.table.both.sets = zscores.table.both.sets[,] } # ################################################# # the internal loop starts here (for i = mfw.min : mfw.max) # ################################################# # a short message on the screen: if(distance.measure == "CD") { cat("Calculating classic Delta distances... \n") } if(distance.measure == "AL") { cat("Calculating Argamon's Delta distances... \n") } if(distance.measure == "ED") { cat("Calculating Eder's Delta distances... \n") } if(distance.measure == "ES") { cat("Calculating Eder's Simple distances... \n") } if(distance.measure == "MH") { cat("Calculating Mahattan distances... \n") } if(distance.measure == "CB") { cat("Calculating Canberra distances... \n") } if(distance.measure == "EU") { cat("Calculating Euclidean distances... 
\n") } for(i in (mfw.min/mfw.incr):(mfw.max/mfw.incr)) { mfw = i * mfw.incr # for safety reasons, if MFWs > words in samples if(mfw > length(list.of.words.after.culling) ) { mfw = length(list.of.words.after.culling) } # the general counter for different purposes number.of.current.iteration = number.of.current.iteration + 1 # the current task (number of MFW currently analyzed) echoed on the screen cat(mfw, " ") # ################################################# # module for Delta # ################################################# if(tolower(classification.method) == "delta") { # calculating classic Delta distances if(distance.measure == "CD") { distance.name.on.graph = "Classic Delta distance" distance.name.on.file = "Classic Delta" distance.table = as.matrix(dist(zscores.table.both.sets[,1:mfw], method="manhattan")) / mfw } # calculating Argamon's "Linear Delta" if(distance.measure == "AL") { distance.name.on.graph = "Argamon's Delta distance" distance.name.on.file = "Argamon's Delta" distance.table = as.matrix(dist(zscores.table.both.sets[,1:mfw], method="euclidean")) / mfw } # calculating Delta distances with Eder's modifications if(distance.measure == "ED") { distance.name.on.graph = "Eder's Delta distance" distance.name.on.file = "Eder's Delta" zscores.plus.e.value = t(t(zscores.table.both.sets[,1:mfw])*((1+mfw:1)/mfw)) distance.table = as.matrix(dist(zscores.plus.e.value,method="manhattan")) } # calculating Eder's Simple distance to a matrix distance.table if(distance.measure == "ES") { distance.table = as.matrix(dist(sqrt(freq.table.both.sets[,1:mfw]),method="manhattan")) distance.name.on.graph = "Eder's Simple distance" distance.name.on.file = "Eder's Simple" } # calculating Manhattan distance to a matrix distance.table if(distance.measure == "MH") { distance.name.on.graph = "Manhattan distance" distance.name.on.file = "Manhattan" distance.table = as.matrix(dist(freq.table.both.sets[,1:mfw],method="manhattan")) } # calculating Canberra distance to a matrix distance.table if(distance.measure == "CB") { distance.name.on.graph = "Canberra distance" distance.name.on.file = "Canberra" distance.table = as.matrix(dist(freq.table.both.sets[,1:mfw],method="canberra")) } # calculating Euclidean distance to a matrix distance.table if(distance.measure == "EU") { distance.name.on.graph = "Euclidean distance" distance.name.on.file = "Euclidean" distance.table = as.matrix(dist(freq.table.both.sets[,1:mfw],method="euclid")) } # replaces the names of the samples (the extension ".txt" is cut off) rownames(distance.table)=gsub("\\.txt$","",rownames(zscores.table.both.sets)) colnames(distance.table)=gsub("\\.txt$","",rownames(zscores.table.both.sets)) # ################################################# # extracting candidates, drawing, printing, etc. 
# #################################################
# extracting candidates, drawing, printing, etc.

# a selected area of the distance.table is needed, with colnames()
no.of.possib = length(primary.set[,1])
no.of.candid = length(secondary.set[,1])
selected.dist =
      as.matrix(distance.table[no.of.possib+1:no.of.candid,1:no.of.possib])

}   # <--- delta

# here comes the code for the other classifiers: SVM, kNN, NSC, Naive Bayes
#
#
#
#
#
#
distance.name.on.graph = "to be deleted"

if(tolower(classification.method) == "knn") {
  # kNN classification:
  library(class)
  #
  # training_set and test_set preparation; adding class labels to both sets
  classes.training = gsub("_.*","",rownames(zscores.primary.set))
  training.set = cbind(classes.training,zscores.primary.set[,1:mfw])
  classes.test = gsub("_.*","",rownames(zscores.secondary.set))
  test.set = cbind(classes.test,zscores.secondary.set[,1:mfw])
  #
  input.data = as.data.frame(rbind(training.set,test.set))
  #
  # number of nearest neighbors to be considered
  k.value = 1
  # classes that will be used for training the classifier (= classes of the I set)
  classes = factor(training.set[,1])
  # training and classification
  classification.results = knn(training.set[,-1],test.set[,-1],classes,k=k.value)
  # cross-validation:
  #knn.cv(training.set[,-1],classes,k=k.value,prob=T)
  classification.results = as.character(classification.results)
}

if(tolower(classification.method) == "naivebayes") {
  # Naive Bayes classification:
  library(e1071)
  #
  # training_set and test_set preparation; adding class labels to both sets
  training.set = primary.set[,1:mfw]
  test.set = secondary.set[,1:mfw]
  classes.training = gsub("_.*","",rownames(training.set))
  classes.test = gsub("_.*","",rownames(test.set))
  classes = c(classes.training, classes.test)
  input.data = as.data.frame(rbind(training.set,test.set))
  input.data = cbind(classes, input.data)
  training.classes = c(1:length(training.set[,1]))
  #
  # training a model
  model = naiveBayes(classes ~ ., data = input.data, subset = training.classes)
  #
  # testing the model on "new" data (i.e. the test.set)
  classification.results = predict(model, input.data[,-1])
  classification.results = as.character(classification.results)
  classification.results = classification.results[-c(1:length(classes.training))]
}

if(tolower(classification.method) == "svm") {
  # Support Vector Machines classification:
  library(e1071)
  #
  # training_set and test_set preparation; adding class labels to both sets
  training.set = primary.set[,1:mfw]
  test.set = secondary.set[,1:mfw]
  classes.training = gsub("_.*","",rownames(training.set))
  classes.test = gsub("_.*","",rownames(test.set))
  classes = c(classes.training, classes.test)
  input.data = as.data.frame(rbind(training.set,test.set))
  input.data = cbind(classes, input.data)
  training.classes = c(1:length(training.set[,1]))
  #
  # training a model
  model = svm(classes ~ ., data = input.data, subset = training.classes)
  #
  # testing the model on "new" data (i.e.
the test.set) classification.results = predict(model, input.data[,-1]) classification.results = as.character(classification.results) classification.results = classification.results[-c(1:length(classes.training))] #plot(cmdscale(dist(input.data[,-1])),col=as.integer(input.data[,1]),pch=c("o","+")) } if(tolower(classification.method) == "nsc") { # Nearest Shrunken Centroid classification: library(pamr) # # training_set and test_set preparation; adding class labels to both sets training.set = primary.set[,1:mfw] test.set = secondary.set[,1:mfw] classes.training = gsub("_.*","",rownames(training.set)) classes.test = gsub("_.*","",rownames(test.set)) classes = c(classes.training, classes.test) input.data = as.data.frame(rbind(training.set,test.set)) training.classes = c(1:length(training.set[,1])) mydata=list(x=t(input.data),y=as.factor(classes)) # training a model model = pamr.train(mydata,sample.subset=c(1:length(classes.training))) # testing the model on "new" data (i.e. the test.set) classification.results = pamr.predict(model,mydata$x,threshold=1) classification.results = as.character(classification.results) classification.results = classification.results[-c(1:length(classes.training))] } # returns the ranking of the most likely candidates as a list if(final.ranking.of.candidates == TRUE) { cat("\n\n\n",file=outputfile,append=T) if(tolower(classification.method) == "delta") { make.ranking.of.candidates(selected.dist,number.of.candidates) } else { misclassified.samples = paste(rownames(test.set), "\t-->\t", classification.results)[classes.test!=classification.results] cat(misclassified.samples,file=outputfile,append=T,sep="\n") } } # returns the number of correct attributions if(how.many.correct.attributions == TRUE) { if(tolower(classification.method) == "delta") { no.of.correct.attrib = make.number.of.correct.attributions(selected.dist) } else { no.of.correct.attrib = sum(as.numeric(classes.test == classification.results)) } total.no.of.correct.attrib = c(total.no.of.correct.attrib, no.of.correct.attrib) total.no.of.possible.attrib = c(total.no.of.possible.attrib, perfect.guessing) cat("\n",file=outputfile,append=T) cat(mfw, " MFW , culled @ ",current.culling,"%, ", no.of.correct.attrib," of ", perfect.guessing,"\t(", round(no.of.correct.attrib / perfect.guessing * 100, 1),"%)", "\n",file=outputfile,append=T,sep="") } } # <-- the internal loop for(i) returns here # ################################################# # blank line on the screen cat("\n") } # <-- the main loop for(j) returns here # ################################################# all.guesses = total.no.of.correct.attrib / total.no.of.possible.attrib * 100 total.no.of.correct.attrib = sum(total.no.of.correct.attrib) total.no.of.possible.attrib = sum(total.no.of.possible.attrib) # information about the current task into the logfile cat("\nGeneral attributive success: ", total.no.of.correct.attrib, " of ", total.no.of.possible.attrib, " (", round(total.no.of.correct.attrib/total.no.of.possible.attrib*100, 1), "%)\n",file=outputfile,append=T,sep="") cat("\nMFWs from ",mfw.min," to ",mfw.max.original, " @ increment ",mfw.incr,"\nCulling from ",culling.min, " to ",culling.max," @ increment ",culling.incr, "\nPronouns deleted: ",delete.pronouns,"; ", distance.name.on.graph,"\n",file=outputfile,append=T,sep="") # additional empty line in outputfile (EOF) cat("\n",file=outputfile,append=T) # the same information (about the current task) on screen cat("\nGeneral attributive success: ", total.no.of.correct.attrib, " of ", 
total.no.of.possible.attrib, " (", round(total.no.of.correct.attrib/total.no.of.possible.attrib*100,1), "%, sd =", round(sd(all.guesses),1),"%)\n") cat("\nMFWs from ",mfw.min," to ",mfw.max.original, " @ increment ",mfw.incr,"\nCulling from ",culling.min, " to ",culling.max," @ increment ",culling.incr, "\nPronouns deleted: ",delete.pronouns,"; ", distance.name.on.graph,"\n",sep="") # ################################################# # final cleaning cat("\n") cat("removing most of the variables... \n") cat("type ls() if you want to see what was not removed\n") cat("if you are going to change the corpus, clean all: rm(list=ls())\n") cat("\n") cat("Results saved in", outputfile, "\n") cat("\n") # a list of variables not to be removed do.not.remove = c("zscores.table.both.sets", "freq.table.both.sets", "freq.I.set.0.culling", "freq.II.set.0.culling", "distance.table","outputfile","all.guesses") # removing the variables which are not on the above list list.of.variables = ls() #rm(list=list.of.variables[!(list.of.variables %in% do.not.remove)]) # #################################################
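
# (Optional, after the whole script has finished: the vector all.guesses, which
# is kept among the variables listed in do.not.remove, stores the percentage of
# correct attributions for each MFW/culling iteration and can be inspected
# manually, e.g.:)
#
#    summary(all.guesses)
#    plot(all.guesses, type = "b", ylab = "correct attributions (%)")
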