##############################################################################
# stylometric classification test, version 0.4.5 (beta release)
#
# Copyright (C) 2009-2012 by Maciej Eder & Jan Rybicki.
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# ############################################################################
# 
# To cite this script in publications you might use:
#   Eder, M., Rybicki, J. (2011). Stylometry with R. In "Digital Humanities 
#   2011: Conference Abstracts." Stanford University, Stanford, CA, pp. 308-11.
#
# Contact with the authors:
#     Maciej Eder <maciejeder@gmail.com>
#     Jan Rybicki <jkrybicki@gmail.com>
#
# ############################################################################
#
#
#
# ver. 0.4.5, 2012/07/12  --> further GUI rearrangements; code cleaned;
#                             bugs fixed
# ver. 0.4.4, 2012/05/31  --> the core code rewritten, the I/II set division 
#                             abandoned, GUI remodelled, tooltips added, 
#                             different input formats supported (xml etc.), 
#                             config options loaded from external file;
#                             the code forked into (1) this script, supporting 
#                             machine-learning methods (Delta, SVM, NSC, ...),
#                             (2) a script for explanatory analyses
# ver. 0.4.3, 2012/04/28  --> feature selection (word and letter n-grams)
# ver. 0.4.2, 2012/02/10  --> three ways of splitting words in English; 
#                             bugs fixed; GUI code rearranged and simplified
# ver. 0.4.1, 2011/06/27  --> better output; better text files uploading,
#                             new options for culling and ranking of candidates
# ver. 0.4.0, 2011/06/20  --> the official world-premiere (Stanford, CA)
# ver. 0.3.9b, 2011/06/1  --> the code simplified; minor cleaning
# ver. 0.3.9, 2011/05/21  --> uploading wordlist from external source;
#                             thousands of improvements; the code simplified
# ver. 0.3.8, 2010/11/01  --> skip top frequency words option added
# ver. 0.3.7, 2010/11/01  --> better graphs; attempt at better graph layout
# ver. 0.3.6, 2010/07/31  --> more graphic options; dozens of improvements
# ver. 0.3.5, 2010/07/19  --> module for colour graphs; module for PCA
# ver. 0.3.4, 2010/07/12  --> module for uploading corpus files improved
# ver. 0.3.3, 2010/06/03  --> the core code simplified and improved (faster!)
# ver. 0.3.2, 2010/05/10  --> reordered GUI; minor cleaning
# ver. 0.3.1, 2010/05/10  --> the z-scores module improved
# ver. 0.3.0, 2009/12/26  --> better counter of "good guesses"; option for
#                             randomly generated samples; minor improvements
# ver. 0.2.99, 2009/12/25 --> platform-independent outputfile saving
# ver. 0.2.98, 2009/12/24 --> GUI thoroughly integrated with initial variables
# ver. 0.2.10, 2009/11/28 --> corrected MFW display in graph, more analysis 
#                             description in outputfile
# ver. 0.2.9, 2009/11/22  --> auto graphs for MSD and CA
# ver. 0.2.8a, 2009/11/21 --> remodelled GUI
# ver. 0.2.8, 2009/11/20  --> GUI: radiobuttons, checkbuttons
# ver. 0.2.7, 2009/11/19  --> language-determined pronoun selection
# ver. 0.2.6, 2009/11/18  --> dialog box (GUI)
# ver. 0.2.5, 2009/11/16  --> module for different distance measures;
#                             thousands of improvements (I/O, interface, etc.)
# ver. 0.2.2, 2009/10/25  --> numerous little improvements; deleting pronouns
# ver. 0.2.1, 2009/08/23  --> module for culling; module for bootstrapping
# ver. 0.2.0, 2009/08/23  --> module for uploading plain text files
# ver. 0.1.9, 2009/07/31  --> numerous improvements, the code simplified
# ver. 0.1.4, 2009/07/19  --> loop for different MFW settings
# ver. 0.0.1, 2009/07/01  --> some bash and awk scripts translated into R
##############################################################################






#######  GENERAL SETTINGS (GUI/TEXT-MODE)  ###################################

# If you wish to use a simple yet effective graphical interface (GUI),
# just set the following option to TRUE, otherwise switch this option to FALSE
# and edit manually the rest of variables (see below).
# If you switch this option on, the values indicated in the following sections 
# will serve as default for the GUI (you can adapt them to your needs)

interactive.mode.with.GUI = FALSE


#######  TEXT- AND LANGUAGE-DEPENDENT SETTINGS  ####################


# format of corpus files; available choices are:
# "plain", "xml", "xml.drama", "xml.notitles", "html"
corpus.format = "plain"


# how many MFW should be taken into analysis (if mfw.min value = max.mfw, 
# then no multiple iterations will be computed)
# start.at option enables skipping top frequency words: you should
# indicate the desired start position of your list (in most cases you will 
# probably prefer setting it to 1)

mfw.min = 100
mfw.max = 100
mfw.incr = 100
start.at = 1

# culling rate specifies the number of texts in a corpus in which a given word 
# must be found in order to be included in the analysis. Thus, a 100% culling 
# rate limits the analysis to words that appear at least once in every text 
# in the corpus; at a 50% culling rate, a word is included into the analysis 
# when it appears in at least half of the texts in the corpus; a 0% culling 
# rate (or no culling) means that no words are omitted.
# about min=max: see above

culling.min = 0
culling.max = 0
culling.incr = 20

# Deleting pronouns (this is independent of the culling procedure).
# If deleting pronouns option is switched to TRUE, choose one language
# of the following: English, Polish, Latin, French, German, Italian, Hungarian
# (the editable lists of pronouns are available below; see: advanced settings).
# Additionally, there are a few variants of language settings available:
# English.contr, English.all, and Latin.corr. Their meaning is as follows:
#     "English.contr": treats the contractions as signle words, i.e. strings
#         such as "don't", "you've" etc. will not be split into two words.
#     "English.all": keeps the contractions (as above), and also prevents
#         from splitting compound words (mother-in-law, double-decker, etc.)
#     "Latin.corr": since some editions do not distinguish the letters v/u,
#         this option provides a consistent conversion to "u" in each text.

delete.pronouns = FALSE
corpus.lang = "English.all"


# Selection of features. In classical approaches, frequencies of the most
# frequent words (MFW) are used as basis for multidimensional analyses.
# It has been argued, however, that also other features are worh considering,
# especially word and/or letter n-grams. The general concept of n-gram
# is to combine a string of single words/letters into a sequence of n
# elements. Given a sample sentence "This is a simple example", the letter 
# 2-grams are as follows: "th", "hi", "is", "s ", " i", "is", "s ", " a", "a ",
# " s", "si", "im", "mp", etc. The same sentence split into word 3-grams:
# "this is a", "is a simple", "a simple example".
# Another question is whether it really increases the accuracy of attribution;
# further reading: Eder, M. (2011). Style-markers in authorship attribution: 
# A cross-language study of the authorial fingerprint, "Studies in Polish
# Linguistics" 6: 101-16.
# Two types of n-grams are available: letters (option "l"), and words ("w").

analyzed.features = "w"
ngram.size = 1



#######  MATHEMATICAL SETTINGS (CLASSIFICATION METHOD)  #############


# method of classification: choose one of the options described below
# Delta ("delta"), k-nearest neighbor classification ("knn"),
# Naive Bayes classification ("naivebayes"), Nearest Shrunken Centroids
# ("nsc"), or Support Vectors Machines ("svm")
classification.method = "knn"



#######  MATHEMATICAL SETTINGS (DISTANCE MEASURE)  #################

# Strictly speaking, the choice of an appropriate distance measure
# is the core of the statistical procedure provided by this script.
# (However, the distance measures do not apply to the PCA method)
# Although this choice is not easy, some of the following measures
# seem to be more suitable for linguistic purposes than others.
# On theoretical grounds, Euclidean Distance and Manhattan
# Distance should be avoided in stylometry. Canberra Distance is quite 
# troublesome but effective e.g. for Latin (it should be combined with 
# careful culling settings and a limited number of MFW taken into analysis). 
# For English, usually Classic Delta is a good choice. A theoretical 
# explanation of the measures implemented in this script is forthcoming (?).
#
# The available distance measures (choose ONE) are as follows:
#   "CD" --> Classic Delta as developed by Burrows
#   "AL" --> Argamon's Linear Delta (based on Euclidean principles)
#   "ED" --> Eder's Delta (explanation and mathematical equation: soon)
#   "ES" --> Eder's Simple (explanation and mathematical equation: soon)
#   "MH" --> Manhattan Distance (obvious and well documented)
#   "CB" --> Canberra Distance (risky, but sometimes amazingly good)
#   "EU" --> Euclidean Distance (basic, and the most "natural")

distance.measure = "CD"


########  VISUALIZATION METHODS, LOGS, REPORTS, DISPLAY OPTIONS  ############

# Theoretically, one can switch on every option at once (not much sense 
# in that). This will work, because the script will treat the last switched on 
# option as your actual choice. You may also want to switch off each option 
# (then, the logfile will be the only output).

# Cluster Analysis of Delta distance table;
#   this option makes sense if there is only a single iteration (or only a few).
# Multidimensional Scaling of Delta distance table;
#   as above: although possible with multiple iterations, it is unwise.
# Principal Component Analysis: either using covariance matrix,
#   or correlation matrix -- the latter might be more convincing
# Bootstrap procedure: multiple iterations will build a consensus tree
#   ATTENTION: this option requires the ape library!!!

#cluster.analysis = FALSE
#multidimensional.scaling = FALSE
#pca.covariance.table = FALSE
#pca.correlation.table = FALSE
#make.consensus.tree = FALSE
consensus.strength=0.5

# Delta is always active: output is directed to a file. You may specify
# the number of final ranking candidates to be displayded (at least 1)

number.of.candidates = 3

# Do you want to display the graph on the screen?
# Do you want to write the graph directly to a graphics file? Which format?

#display.on.screen = TRUE
#write.pdf.file = FALSE
#write.jpg.file = FALSE
#write.emf.file = FALSE    # Windows only
#write.png.file = FALSE

# Do you want the graphs coloured? Do you want titles on the graphs?

#use.color.graphs = TRUE
#titles.on.graphs = TRUE

# Layout of dendrogram: horizontal/vertical (Cluster Analysis only)

#dendrogram.layout.horizontal = TRUE

# Report the number of correct guesses for each iteration (written to 
# the log file). Ranking of the least unlikely candidates in the log file.

how.many.correct.attributions = TRUE
final.ranking.of.candidates = TRUE


#######  ADVANCED SETTINGS (FOR EXPERTS ONLY)  ########################

# Normally, the script is computing a big table of thousands 
# of word frequencies. This is a non-trivial and time-consuming task.
# If done once, there is no need to waste time and do it again, because
# the tables are also written into output files. To retrieve all the word
# frequencies from existing files, switch this option to TRUE.
# BUT it MUST be set to FALSE when you switch corpora in the same R session!

use.existing.freq.tables = TRUE

# Some people like to see what's going on, and to be able to revise/edit
# the list of words for analysis. To meet their wishes, the script
# saves the list into a separate output file. You can either delete as many 
# words as you want from this file, or mark the unwanted words with "#" 
# (just like these comments are marked). Switching the following option on
# prevents the script from overwriting the file, and provides that the wordlist
# is loaded from there.

use.existing.wordlist = TRUE

# Usually, it is recommended to cut off the tail of the word-list;
# if you do not want to cut the list, then the variable may be set to an 
# absurdly big number, or to "mfw.list.cutoff = mfw.list.of.all"
# (and then you are advised to use a fast computer)

mfw.list.cutoff = 5000

# How the z-scores should be calculated:
# if the variable is set to FALSE, then the z-scores are relying
# on the primary set only (this should be better in most cases; after all,
# this is the classical solution used by Burrows and Hoover).
# Otherwise, the scaling is based on all the values
# in the primary and the secondary sets.

z.scores.of.all.samples = FALSE

# The both talbes of frequencies are build using the pre-prepared word
# list of the whole I set. Alternatively, one might want to prepare 
# this list of both sets. Similarily culling: it can be calcutated either 
# on the I set, or on both sets

reference.wordlist.of.all.samples = FALSE
culling.of.all.samples = TRUE


# file with the final ranking of Delta results (log file)

outputfile = "final_results.txt"

# pronouns (and other words) to be deleted

pol.pronouns = c("ci", "ciebie", "cię", "go", "ich", "im", "ja", "ją", "je", "jego", "jej", "jemu", "ma", "mą", "me", "mego", "mej", "memu", "mi", "mną", "mnie", "moi", "moich", "moim", "moimi", "moja", "moją", "moje", "mojego", "mojej", "mojemu", "mój", "mu", "my", "mych", "mym", "mymi", "nam", "nami", "nas", "nią", "nich", "nie", "niego", "niej", "niemu", "nim", "nimi", "on", "ona", "one", "oni", "ono", "swa", "swą", "swe", "swego", "swej", "swemu", "swoi", "swoich", "swoim", "swoimi", "swoja", "swoją", "swoje", "swojego", "swojej", "swojemu", "swój", "swych", "swym", "swymi", "tobą", "tobie", "twa", "twą", "twe", "twego", "twej", "twemu", "twoi", "twoich", "twoim", "twoimi", "twoja", "twoją", "twoje", "twojego", "twojej", "twojemu", "twój", "twych", "twym", "twymi", "ty", "wam", "wami", "was", "wy", "wasz", "wasza", "wasze", "waszym", "waszymi", "waszych", "waszego", "waszej", "waszą")
eng.pronouns = c("he", "her", "hers", "herself", "him", "himself", "his", "i", "me", "mine", "my", "myself", "our", "ours", "ourselves", "she", "thee", "their", "them", "themselves", "they", "thou", "thy", "thyself", "us", "we", "ye", "you", "your", "yours", "yourself")
lat.pronouns = c("ea", "eae", "eam", "earum", "eas", "ego", "ei", "eis", "eius", "eo", "eorum", "eos", "eum", "id", "illa", "illae", "illam", "illarum", "illas", "ille", "illi", "illis", "illius", "illo", "illorum", "illos", "illud", "illum", "is", "me", "mea", "meae", "meam", "mearum", "meas", "mei", "meis", "meo", "meos", "meorum", "meum", "meus", "mihi", "nobis", "nos", "noster", "nostra", "nostrae", "nostram", "nostrarum", "nostras", "nostri", "nostris", "nostro", "nostros", "nostrorum", "nostrum", "sua", "suae", "suam", "suarum", "suas", "sui", "suis", "suo", "suos", "suorum", "suum", "suus", "te", "tibi", "tu", "tua", "tuae", "tuam", "tuarum", "tuas", "tui", "tuis", "tuo", "tuos", "tuorum", "tuum", "tuus", "vester", "vestra", "vestrae", "vestram", "vestrarum", "vestras", "vestri", "vestris", "vestro", "vestros", "vestrorum", "vestrum", "vobis", "vos")
fra.pronouns = c("je", "me", "moi", "tu", "te", "toi", "il", "elle", "le", "la", "lui", "se", "lui", "elle", "soi", "nous", "vous", "ils", "elles", "les", "leur", "se", "eux", "elles", "soi")
ger.pronouns = c("ich", "mich", "mir", "mein", "meine", "meiner", "meines", "du", "dich", "dir", "dein", "deine", "deiner", "deines", "er", "sich", "ihr", "ihrer", "ihn", "ihnen", "sein", "seiner", "seines", "seine", "sie", "wir", "uns", "unser", "unsere", "euch", "eure", "euer")
ita.pronouns = c("ci", "gli", "io", "la", "le", "lei", "li", "loro", "lo", "lui", "me", "mi", "noi", "si", "te", "ti", "tu", "vi", "voi", "egli", "ella", "esso", "essa", "essi", "esse", "mio", "mia", "miei", "mie", "tuo", "tua", "tuoi", "tue", "suo", "sua", "suoi", "sue", "nostro", "nostra", "nostri", "nostre", "vostro", "vostra", "vostri", "vostre", "loro", "loro", "loro", "loro")
hun.pronouns = c("annak", "az", "azzal", "bele", "belé", "beléd", "beléje", "beléjük", "belém", "belénk", "belétek", "belöle", "belőled", "belőlem", "belőletek", "belőlük", "belőlünk", "benne", "benned", "bennem", "bennetek", "bennük", "bennünk", "én", "ennek", "enyéim", "enyém", "enyémek", "érte", "érted", "értem", "értetek", "értük", "értünk", "ez", "ezzel", "hozzá", "hozzád", "hozzája", "hozzájuk", "hozzám", "hozzánk", "hozzátok", "maga", "magáé", "magáéi", "maguk", "maguké", "magukéi", "mi", "mieink", "mienk", "miénk", "nála", "nálad", "nálam", "nálatok", "náluk", "nálunk", "neked", "nekem", "neki", "nekik", "nektek", "nekünk", "ő", "ők", "ön", "öné", "önéi", "önnek", "önnel", "önök", "önöké", "önökéi", "önökkel", "önöknek", "övé", "övéi", "övéik", "övék", "rád", "rája", "rajta", "rajtad", "rajtam", "rajtatok", "rajtuk", "rajtunk", "rájuk", "rám", "ránk", "rátok", "róla", "rólad", "rólam", "rólatok", "róluk", "rólunk", "te", "ti", "tied", "tiéd", "tieid", "tieitek ", "tietek", "tiétek", "tőle", "tőled", "tőlem", "töletek", "tőlük", "tőlünk", "vele", "veled", "velem", "veletek", "velük", "velünk")

# when analyzed texts are significantly unequal in length, it is not a bad
# idea to prepare samples as randomly chosen "bags of words". If this option
# is switched on, the desired size of a sample should be indicated.
# Sampling with and without replacement is also available.
# (Further reading: Eder, M. (2010). Does Size Matter? Authorship Attribution,
# Short Samples, Big Problem. In "Digital Humanities 2010: Conference 
# Abstracts." King's College London 2010, pp. 132-35.)
#
# ATTENTION: this makes sense only if "use.existing.freq.tables" is set "FALSE"

random.sampling = FALSE
length.of.random.sample = 10000
sampling.with.replacement = FALSE

# the variables are now ready to use (unless the GUI option was chosen)
# ###################################################################








# #################################################
# checking some of the initial variables -- just in case
# #################################################

# Given a language option ("English", "Polish", "Latin" etc., as described 
# above), this procedure selects one of the lists of pronouns
# If no language was chosen (or if a desired language is not supported, or if 
# there was a spelling mistake), then the variable will be set to "English". 
# If "Pronouns deleted" is set to FALSE, this is immaterial.

if(exists("pronouns") == FALSE) # checking if the "pronouns" box is empty
    pronouns = eng.pronouns


# This prevents us from choosing a non-existing distance measure -- in such
# case the default distance (Classic Delta) will be switched on. Be aware
# of correct spelling: then the default value will be assigned as well!

if(distance.measure %in% c("CD","AL","ED","ES","MH","CB","EU") == FALSE) {
  distance.measure = "CD"
  }

classification.method = tolower(classification.method)


# #################################################




if(file.exists("config.txt") == TRUE) {
  source("config.txt") }


# #############################################################################
# Final settings (you are advised rather not to change them)
# #############################################################################


# A chosen language option should be followed by an assignment of 
# the appropriate set of pronouns. The following code is responsible for it
  if(corpus.lang == "English")
      pronouns = eng.pronouns
  if(corpus.lang == "Polish")
      pronouns = pol.pronouns 
  if(corpus.lang == "Latin")
      pronouns = lat.pronouns
  if(corpus.lang == "French")
      pronouns = fra.pronouns
  if(corpus.lang == "German" )
      pronouns = ger.pronouns
  if(corpus.lang == "Italian")
      pronouns = ita.pronouns
  if(corpus.lang == "Hungarian")
      pronouns = hun.pronouns

# Since it it not so easy to perform, say, 17.9 iterations, or analyze
# 543.3 words, the code below justifies all numerical variables, to prevent 
# you from your stupid jokes with funny settings. (OK, it is still
# possible to crash the script but we will not give you a hint)
  mfw.min = round(mfw.min)
  mfw.max = round(mfw.max)
  mfw.incr = round(mfw.incr)
  start.at = round(start.at)
  culling.min = round(culling.min)
  culling.max = round(culling.max)
  culling.incr = round(culling.incr)
  mfw.list.cutoff = round(mfw.list.cutoff)

# This also prevents from unexpected settings
if(number.of.candidates < 1) {
  number.of.candidates = 1
  number.of.candidates = round(number.of.candidates)
  }


# Finally, we want to save some variable values for later use
cat("",file="config.txt",append=F)
var.name <- function(x) { 
      if(is.character(x)==TRUE) {
      cat(paste(deparse(substitute(x)),"=\"",x,"\"", sep=""),file="config.txt",sep="\n",append=T)
        } else {
          cat(paste(deparse(substitute(x)),x, sep="="),file="config.txt",sep="\n",append=T) }
        }

 var.name(corpus.format)
 var.name(corpus.lang)
 var.name(analyzed.features)
 var.name(ngram.size)
 var.name(random.sampling)
 var.name(length.of.random.sample)
 var.name(classification.method)
 var.name(mfw.min)
 var.name(mfw.max)
 var.name(mfw.incr)
 var.name(start.at)
 var.name(mfw.list.cutoff)
 var.name(culling.min)
 var.name(culling.max)
 var.name(culling.incr)
 var.name(delete.pronouns)
 var.name(culling.of.all.samples)
 var.name(final.ranking.of.candidates)
 var.name(how.many.correct.attributions)
 var.name(use.existing.freq.tables)
 var.name(use.existing.wordlist)
 var.name(distance.measure)
 var.name(number.of.candidates)
 var.name(z.scores.of.all.samples)
 var.name(reference.wordlist.of.all.samples)


# #############################################################################
# #############################################################################






# #################################################
# FUNCTIONS:




# #################################################
# Function for combining single features (words
# or letters) into n-grams, or strings of n elements;
# e.g. letter 2-grams of the sentence "This is a sentence"
# are as follows: "th", "hi", "is", "s ", " i", "is", etc.
# Required argument: name of the vector of words/letters
# #################################################

make.ngrams = function(input.text) {
  txt = c()
  if(ngram.size > 1) {
    txt = input.text
    for(n in 2:ngram.size) {
    txt = paste(txt[1:(length(txt)-1)],input.text[n:length(input.text)])
    }
  } else {
  # if n-gram size is set to 1, then nothing will happen
  txt = input.text
  }
return(txt)
}




# #################################################
# Generic function for splitting a given input text into
# single words (chains of characters delimited with
# spaces or punctuation marks). Alternatively, 
# you can write here another rule for splitting.
# Required argument: name of the text to be split.
# ATTENTION: this is the only piece of coding in this script
# that dependens on the operating system used
# #################################################

split.into.words = function(input.text) {
  # splitting into units specified by regular expression; here, 
  # all sequences between non-letter characters are assumed to be words:
  ### Linux, Mac
  #tokenized.txt = c(unlist(strsplit(input.text, "[^[:alpha:]]+")))
  ### Windows
  tokenized.txt = c(unlist(strsplit(input.text, "\\W+",perl=T)))
return(tokenized.txt)
}





# #################################################
# Function for splitting a given input text into
# single words (chains of characters delimited with
# spaces or punctuation marks). There is also an option
# of splitting the text into letters and/or performing
# splitting into n-grams
# #################################################

split.sample = function(input.text) {
  # loading the file; optionally, fiddling with dashes and contractions:
  #
  # this is the standard procedure of splitting input texts
  if(corpus.lang != "English.contr" && corpus.lang != "English.all") {
    tokenized.sample = split.into.words(input.text)
    }
  # if Latin option with adjusting the v/u letters was switched on,
  # smashing the distinction and converting both types to the letter u
  if(corpus.lang == "Latin.corr") {
    tokenized.sample = gsub("v","u",tokenized.sample)
    }    
  # this code is used for English corpora only
  if(corpus.lang == "English.contr" || corpus.lang == "English.all") {
    # replacing non-ASCII apostrophes with simple ' (standard ASCII char)
    tokenized.sample = gsub("’","'",input.text)
    # getting rid of contractions ('t, 's, 've, 'd, 'll, 'em, 'im) by replacing
    # their apostrophes with ^ (other apostrophes will not be replaced)
    tokenized.sample = gsub("([[:alpha:]])'([tsdm]|ll|ve|em|im)\\b","\\1^\\2",
                            tokenized.sample)
    # adding spaces around dashes (to distinguish dashes and hyphens)
    tokenized.sample = gsub("[-]{2,5}"," -- ",tokenized.sample)
    # depending on which option was swithed on, either the contractions are
    # kept, or all the peculiarities, i.e. both contractions and hyphens
    if(corpus.lang == "English.contr") {
      tokenized.sample=c(unlist(strsplit(tokenized.sample,"[^[:alpha:]^]+")))
      }
    if(corpus.lang == "English.all") {
      tokenized.sample=c(unlist(strsplit(tokenized.sample,"[^[:alpha:]^-]+")))
      # trying to clean the remaining dashes:
      tokenized.sample = gsub("^[-]+$","",tokenized.sample)
      }
  }
  # trying to avoid empty strings:
  tokenized.sample = tokenized.sample[nchar(tokenized.sample)>0]
  # trying to get rid of non-letter characters:
  tokenized.sample = tokenized.sample[grep("[^[:digit:]]",tokenized.sample)]
  #
  #
  # splitting the sample into letters (if analyzed.features was set to "l")
  if(analyzed.features == "l") {
    tokenized.sample = paste(tokenized.sample, collapse=" ")
    tokenized.sample = unlist(strsplit(tokenized.sample,""))
    }
  #
  # making n-grams (if the value "n" has been set to 2 or more) 
  if(ngram.size > 1) {
  tokenized.sample = make.ngrams(tokenized.sample)
  }
# the result of the function:
return(tokenized.sample)
}



# #################################################
# Function for adjusting different input formats:
# xml (TEI) in two variants, html, and plain text files.
# Required argument: name of the text to pre-process
# #################################################

delete.markup = function(input.text) {
  if(corpus.format == "xml" || corpus.format == "xml.drama") {
    # getting rid of TEI header (if exists)
    if(length(grep("</teiheader>",input.text)) > 0) {
      input.text = input.text[-c(1:(grep("</teiheader>",input.text)))]
      }
    # the whole text into one (very) long line
    preprocessed.text = paste(input.text, collapse=" ")
    # getting rid of dramatis personae
    if(corpus.format == "xml.drama"){
      preprocessed.text = gsub("<speaker>.*?</speaker>","",preprocessed.text)
      }
    # getting rid of comments and (editorial) notes
    preprocessed.text = gsub("<note.*?</note>","",preprocessed.text)
    # getting rid of all the remaining tags
    preprocessed.text = gsub("<.*?>","",preprocessed.text)
  }
  if(corpus.format == "html") {
    # getting rid of html header (if exists)
    if(length(grep("<body",input.text)) > 0) {
      input.text = input.text[-c(1:(grep("<body",input.text)))]
      }
    # the whole text into one (very) long line
    preprocessed.text = paste(input.text, collapse=" ")
    # getting rid of links (menus and similar stuff should be deleted, hopefully)
    preprocessed.text = gsub("<a href.*?/a>","",preprocessed.text)
    # getting rid of all the remaining tags
    preprocessed.text = gsub("<.*?>","",preprocessed.text)
  } else {
  preprocessed.text = input.text
  }
return(preprocessed.text)
}




# #################################################
# Function for preparing and printing
# the final ranking of the least unlikely authors
# for each text in a given table of distances.
# Arguments: 1. table with distances, 2. number of candidates to be displayed
# #################################################

make.ranking.of.candidates = function(dist.matrix,candidates) {
  for(h in 1:length(dist.matrix[,1])) {
  ranked.candidates = order(dist.matrix[h,])
  current.sample = c(gsub("_.*","",colnames(dist.matrix)))[ranked.candidates]
    if((gsub("_.*","",rownames(dist.matrix)))[h] != current.sample[1]) {
  	cat(c(c(rownames(dist.matrix))[h]," ", "-->", " ", 
      current.sample[1:candidates], " (delta score:", 
      round(dist.matrix[h,ranked.candidates[1:candidates]],4), ")",
      "\n"),file=outputfile,append=T)
    }
  }
}



# #################################################
# Function for preparing and printing
# the number of correct attributions
# this is a variant of the above function make.ranking.of.candidates
# Argument: table of distances
# #################################################

make.number.of.correct.attributions = function(dist.matrix) {
  corr.attrib = 0
  for(h in 1:length(dist.matrix[,1])) {
  ranked.c = order(dist.matrix[h,])
  current.sample = c(gsub("_.*","",colnames(dist.matrix)))[ranked.c]
    if((gsub("_.*","",rownames(dist.matrix)))[h] == current.sample[1]) {
    corr.attrib = corr.attrib + 1
    }
  }
# the result of the function:
return(corr.attrib)
}

# #############################################################################




# #################################################
# the module for loading a corpus from text files;
# it can be omitted if the frequency table for
# both primary and secondary sets already exist
# (then "use.existing.freq.tables" should be set 
# to TRUE in the preamble of the script/GUI)
# #################################################
#
#
#
# Checking: (1) whether produce new frequency tables or use existing ones;
# (2) whether the tables are stored in memory or written into files.
# If you have chosen using the existing tables and there are no such tables
# available, then your choice will be ignored
if(use.existing.freq.tables == TRUE 
            && file.exists("freq_table_primary_set.txt") == TRUE
            && file.exists("freq_table_secondary_set.txt") == TRUE
            ) { 
      if(exists("freq.I.set.0.culling") && exists("freq.II.set.0.culling")) {
      cat("\n", "using frequency tables stored as variables...", "\n")
        } else {
          cat("\n", "reading files with frequency tables...", "\n")
          freq.I.set.0.culling = t(read.table("freq_table_primary_set.txt"))
          freq.II.set.0.culling = t(read.table("freq_table_secondary_set.txt"))
          cat("\n", "frequency tables loaded successfully", "\n\n")
        }
      # extracting names of the samples
      filenames.primary.set = rownames(freq.I.set.0.culling)
      filenames.secondary.set = rownames(freq.II.set.0.culling)
      #
      # checking whether existing wordlist should be used as well
      if (use.existing.wordlist == TRUE && file.exists("wordlist.txt") == TRUE){
          cat("\n", "reading a wordlist from file...", "\n")
          mfw.list.of.all = scan("wordlist.txt",what="char",sep="\n")
          mfw.list.of.all = c(grep("^[^#]",mfw.list.of.all,value=TRUE))
          #
          # adjusting the size of frequency tables with the existing wordlist
          freq.I.set.0.culling = 
                       freq.I.set.0.culling[,colnames(freq.I.set.0.culling) 
                       %in% mfw.list.of.all]
          freq.II.set.0.culling = 
                       freq.II.set.0.culling[,colnames(freq.II.set.0.culling) 
                       %in% mfw.list.of.all]
      } else {
          # the wordlist will be created from existing tables with frequencies
          mfw.list.of.all = colnames(freq.I.set.0.culling)
          # some comments into the file containing wordlist
          cat("# This file contains the words that were used in the table",
          "# of frequencies uploaded from an external file. The current list",
          "# can be used for the next tasks, and for this purpose it can be",
          "# manually revised, edited, deleted, culled, etc.", 
          "# You can either delete unwanted words, or mark them with \"#\"",
          "# -----------------------------------------------------------------",
          "",
          file="wordlist.txt", sep="\n")
          # the current wordlist into a file
          cat(mfw.list.of.all, file="wordlist.txt", sep="\n",append=F)
        }
# if existing tables will not be used, then begin producing new tables
  } else {
#
# Retrieving the names of samples
#
filenames.primary.set = list.files("primary_set")
filenames.secondary.set = list.files("secondary_set")
#
# Checking whether required files and subdirectories exist
if(file.exists("primary_set")==FALSE || file.exists("secondary_set")==FALSE) {
    cat("\n\n !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!\n",
    "Working directory should contain two subdirectories: 
    \"primary_set\" and \"secondary_set\"\n",
    "!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!\n\n")
    stop("corpus prepared incorrectly")
    }
if(length(filenames.primary.set) < 2 || length(filenames.secondary.set) < 2) {
    cat("\n\n !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!\n",
    "Both subdirectories \"primary_set\" and \"secondary_set\"
    should contain at least two text samples!\n",
    "!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!\n\n")
    stop("corpus prepared incorrectly")
    }
#
#
#
# loading the primary set from text files
corpus.of.primary.set = list()
setwd("primary_set")
  for (file in filenames.primary.set) {
  # loading the next file from the list filenames.primary.set,
  current.file = tolower(scan(file,what="char",sep="\n", quiet=T))
  # delete xml/html markup (if applicable)
  current.file = delete.markup(current.file)
  # deleting punctuation, splitting into words:
  split.file = split.sample(current.file)
  # if the current text is too short, abort the script
  if(length(split.file) < 10) {
    cat("\n\n",file, "\t", "this sample is VERY short!", "\n\n")
    setwd(".."); stop("corpus error")
    }
  # appending the current text to the virtual corpus
  corpus.of.primary.set[[file]] = split.file
  cat(file, "\t", "loaded successfully", "\n")
  }
setwd("..")
#
# loading the secondary set from text files
corpus.of.secondary.set = list()
setwd("secondary_set")
  for (file in filenames.secondary.set) {
  # loading the next file from the list filenames.secondary.set,
  current.file = tolower(scan(file,what="char",sep="\n", quiet=T))
  # delete xml/html markup (if applicable)
  current.file = delete.markup(current.file)
  # deleting punctuation, splitting into words:
  split.file = split.sample(current.file)
  # if the current text is too short, abort the script
  if(length(split.file) < 10) {
    cat("\n\n",file, "\t", "this sample is VERY short!", "\n\n")
    setwd(".."); stop("corpus error")
    }
  # appending the current text to the virtual corpus
  corpus.of.secondary.set[[file]] = split.file
  cat(file, "\t", "loaded successfully", "\n")
  }
setwd("..")
# blank line on the screen
cat("\n")
#
#
#
# both directories (primary_set and secondary_set) shoud contain some texts;
# if the number of text samples is lower than 2, the script will stop
if(length(corpus.of.primary.set) < 2 || length(corpus.of.secondary.set) < 2) {
    cat("\n\n","either primary_set or secondary_set is empty!", "\n\n")
    stop("corpus error")
    }
#
#
# We need a list of the most frequent words used in the current corpus, 
# in descending order, without frequencies (just a list of words). It can be 
# either loaded from a file (then set the option "use.existing.wordlist=TRUE"), 
# or created by the code provided below:
#
if (use.existing.wordlist == TRUE && file.exists("wordlist.txt") == TRUE) {
          cat("\n", "reading a wordlist from file...", "\n")
          mfw.list.of.all = scan("wordlist.txt",what="char",sep="\n")
          mfw.list.of.all = c(grep("^[^#]",mfw.list.of.all,value=TRUE))
} else {
# Extracting all the words used in the texts of primary set 
# (or both if "Z-scores all" is set to TRUE)
#
wordlist.of.primary.set = c()
  for (file in 1 : length(corpus.of.primary.set)) {
    # loading the next sample from the list filenames.primary.set,
    current.text = corpus.of.primary.set[[file]]
    # putting samples together:
    wordlist.of.primary.set = c(wordlist.of.primary.set, current.text)
    cat(names(corpus.of.primary.set[file]),"\t","tokenized successfully", "\n")
    }
# including words of the secondary set in the reference wordlist (if specified)
  if (reference.wordlist.of.all.samples == TRUE) {
    wordlist.of.secondary.set = c()
    for (file in 1 : length(corpus.of.secondary.set)) {
      # loading the next sample from the list filenames.secondary.set,
      current.text = corpus.of.secondary.set[[file]]
      # putting samples together:
      wordlist.of.secondary.set = c(wordlist.of.secondary.set, current.text)
    cat(names(corpus.of.secondary.set[file]),"\t","tokenized successfully","\n")
    } 
  } else {
  wordlist.of.secondary.set = c()}
#
#
# preparing a sorted frequency list of the whole primary set
mfw.list.of.all = sort(table(
                  c(wordlist.of.primary.set,wordlist.of.secondary.set)),
                  decreasing=T)
  # if the whole list is long, then cut off the tail (e.g., > 5000 mfw)
  if (length(mfw.list.of.all) > mfw.list.cutoff) {
  mfw.list.of.all = mfw.list.of.all[1:mfw.list.cutoff]
  }
# the only thing we need are words ordered by frequency (no frequencies)
mfw.list.of.all = names(mfw.list.of.all)
#
# some comments into the file containing wordlist
cat("# This file contains the words that were used for building the table",
  "# of frequencies. It can be also used for the next tasks, and for this",
  "# purpose it can be manually revised, edited, deleted, culled, etc.", 
  "# You can either delete unwanted words, or mark them with \"#\"",
  "# -----------------------------------------------------------------------",
  "",
      file="wordlist.txt", sep="\n")
# the current wordlist into a file
cat(mfw.list.of.all, file="wordlist.txt", sep="\n",append=F)
#
}   # <----- conditional expr. "use.existing.wordlist" terminates here
#
# 
#
#
# blank line on the screen
cat("\n")
#
#
# #################################################
# FUNCTION: make.paralel.frequency.lists()
# preparing a huge table with all the frequencies (> mwf.list.cutoff).
# Two arguments are required -- a vector with filenames
# and a specified variable where the corpus is stored (in a list)
# #################################################
#
make.paralel.frequency.lists = function(filenames,current.corpus) {
  freq.list.of.all.the.samples = c()
  freq.list.of.current.sample = c()
    for (file in filenames) {
    # loading the next sample from the list filenames.primary.set,
    current.sample = current.corpus[[file]]
    #
    # if random sampling was chosen, the text will be randomized and 
    # a sample of a given lenght will be excerpted
    if(random.sampling == TRUE) {
    current.sample = head(sample(current.sample,
                        replace = sampling.with.replacement), 
                        length.of.random.sample)
    }
    #
    #
    # preparing the frequency list of the current sample
    raw.freq = table(current.sample) * 100 / length(current.sample)
    # adjusting the frequency list to the main MFW list obtained above
    freq.list.of.current.sample = raw.freq[mfw.list.of.all]
    # taking the names (sc. words) from the main MFW list 
    names(freq.list.of.current.sample) = mfw.list.of.all
    # and sticking the current sample into the general frequency table
    freq.list.of.all.the.samples = 
               rbind(freq.list.of.all.the.samples, freq.list.of.current.sample)
    # a short message on the screen:
    cat(file, "\t", "excerpted successfully", "\n")
  }
  # adjusting names of the rows (=samples)
  rownames(freq.list.of.all.the.samples) = c(filenames)
# the result of the function
return(freq.list.of.all.the.samples)
}
#
#
# preparing a huge table of all the frequencies for the primary set
freq.I.set.0.culling = 
  make.paralel.frequency.lists(filenames.primary.set,corpus.of.primary.set)
# all NA values will be adjusted to 0
freq.I.set.0.culling[which(is.na(freq.I.set.0.culling))] = 0
#
#
# preparing a huge table of all the frequencies for the secondary set
freq.II.set.0.culling = 
  make.paralel.frequency.lists(filenames.secondary.set,corpus.of.secondary.set)
# all NA values will be adjusted to 0
freq.II.set.0.culling[which(is.na(freq.II.set.0.culling))] = 0
#
#
# writing the frequency tables to text files (they can be re-used!)
write.table(t(freq.I.set.0.culling), 
            file="freq_table_primary_set.txt", 
            sep="\t",
            row.names=TRUE,
            col.names=TRUE)
write.table(t(freq.II.set.0.culling), 
            file="freq_table_secondary_set.txt", 
            sep="\t",
            row.names=TRUE,
            col.names=TRUE)
#
}  # <----- conditional expr. "use.existing.freq.tables" terminates here
#
#
# #################################################
# the module for loading the corpus terminates here
# #################################################







# #################################################
# MAIN PROGRAM; the main loop is below
# #################################################


# cleaning the outputfile
cat("",file=outputfile,append=F)

# saving the original mfw.max value in mfw.max.original
# this is useful for subtitles of bootstrap graphs
mfw.max.original = mfw.max

# the general counter for different purposes: initiation
number.of.current.iteration = 0

# useful for diagnostic reasons; this will be reported in the logfile
total.no.of.correct.attrib = c()
total.no.of.possible.attrib = c()

# retrieving the total number of texts to be "guessed"
# (anonymous texts and unique authorial samples will not be counted)
authors.I.set = c(gsub("_.*","",rownames(freq.I.set.0.culling)))
authors.II.set = c(gsub("_.*","",rownames(freq.II.set.0.culling)))
perfect.guessing = length(authors.II.set[authors.II.set %in% authors.I.set])

# load the ape library; make an empty bootstrap.results list
# this will be executed only if the bootstrap option was checked
#if (make.consensus.tree == TRUE) {
#    library(ape)
#    bootstrap.list = list()
#    }






# #################################################
# module for culling (THE MAIN LOOP IN THE PROGRAM)
# #################################################
# #################################################
# #################################################
# module for culling (THE MAIN LOOP IN THE PROGRAM)
# #################################################


# testing if desired culling settings are acceptable;
# if too large, it is set to maximum possible
  if(culling.max > 100) {
  culling.max = 100
  }
# if too small, it is set to 0 (i.e. minimal value)
  if(culling.min < 0) {
  culling.min = 0
  }
# avoiding infinite loops
  if(culling.incr <= 1) {
  culling.incr = 10
  }

# #################################################



for(j in (culling.min/culling.incr):(culling.max/culling.incr)) {
current.culling = j * culling.incr

# the beginning of the culling procedure (this is to be done 
# on the primary set only; the secondary set is using the same list!)
#
raw.list.after.culling = c()

# extracting non-zero values from primary set frequency table,
# or from both frequency tables (if specified)
if(culling.of.all.samples == FALSE) {
  nonzero.values = freq.I.set.0.culling > 0
  } else {
  nonzero.values = rbind(freq.I.set.0.culling,freq.II.set.0.culling) > 0
  }

# counting of how many non-zero values there are
for (y in 1: length(nonzero.values[1,])) {
  raw.list.after.culling = c(raw.list.after.culling, 
              (length(grep("TRUE",nonzero.values[,y])) / 
                     length(nonzero.values[,y])) 
                           >= current.culling/100 
                           )
  }
# a raw culling list has no word-identification; let's change it:
names(raw.list.after.culling) = colnames(freq.I.set.0.culling)

# a simple sequence of words which were not culled
list.of.words.after.culling = 
          c(names(raw.list.after.culling[grep("TRUE",raw.list.after.culling)]))


# procedure for deleting pronouns
if (delete.pronouns == TRUE) {
    list.of.words.after.culling = 
      list.of.words.after.culling[!(list.of.words.after.culling %in% pronouns)]
    }


# the above list-of-not-culled to be applied to both sets:
primary.set = freq.I.set.0.culling[,c(list.of.words.after.culling)]
    rownames(primary.set) = filenames.primary.set
secondary.set = freq.II.set.0.culling[,c(list.of.words.after.culling)]
    rownames(secondary.set) = filenames.secondary.set

# #################################################
# culling is done, but we are still inside the main loop







# starting the frequency list at frequency rank set in option start.at above

primary.set = primary.set[,start.at:length(primary.set[1,])]
secondary.set = secondary.set[,start.at:length(secondary.set[1,])]


# Testing if the desired MFW number is acceptable,
# if MFW too large, it is set to maximum possible.
  if(mfw.max > length(primary.set[1,])) {
  mfw.max = length(primary.set[1,])
  }
# if too small, it is set to 1 (i.e., minimal value)
  if(mfw.min < 1) {
  mfw.min = 1
  }
# if culling is too strong, sometimes strange things may happen; let's block it
  if(mfw.min > mfw.max) {
  mfw.min = mfw.max
  }
# MFW set to mfw.max for a while (it will change later on)
mfw = mfw.max



cat("\n")
cat("culling @ ", current.culling,"\t","available words ",mfw.max,"\n")




# #################################################
# z-scores calcutations
# #################################################


# mean and standard dev. for each word (in primary set)
primary.set.mean = c(sapply(as.data.frame(primary.set), mean))
primary.set.sd = c(sapply(as.data.frame(primary.set), sd))

# calculating z-scores for both I and II sets (a message on the screen)
cat("Calculating z-scores... \n\n")



# an additional table composed of relative word frequencies 
# of joint primary and secondary sets
freq.table.both.sets = rbind(primary.set[,1:mfw.max], secondary.set[,1:mfw.max])



# Entropy distance: experimental, but entirely available
# (the results do not really differ than for typical word frequencies)
#
#A = t(t(freq.table.both.sets + 1) / colSums(freq.table.both.sets + 1))
#B =t(t(log(freq.table.both.sets + 2)) / -(colSums(A * log(A))))
#freq.table.both.sets = B
#



# calculating z-scores either of primary set, or of both sets
if(z.scores.of.all.samples == FALSE) {
  # function for z-scores scaling executed for primary.set
  zscores.primary.set = scale(primary.set)
  rownames(zscores.primary.set) = rownames(primary.set)
  # function for z-scores scaling executed for secondary.set
  zscores.secondary.set = 
            scale(secondary.set, center=primary.set.mean, scale=primary.set.sd)
  rownames(zscores.secondary.set) = rownames(secondary.set)
  # the two tables with calculated z-scores should be put together
  zscores.table.both.sets = rbind(zscores.primary.set, zscores.secondary.set)
} else {
  # the z-scores can be calculated on both sets as alternatively  
  zscores.table.both.sets = scale(freq.table.both.sets)
  zscores.table.both.sets = zscores.table.both.sets[,]
  }










# #################################################
# the internal loop starts here (for i = mfw.min : mfw.max)
# #################################################

# a short message on the screen:
if(distance.measure == "CD") {
  cat("Calculating classic Delta distances... \n")
  }
if(distance.measure == "AL") {
  cat("Calculating Argamon's Delta distances... \n")
  }
if(distance.measure == "ED") {
  cat("Calculating Eder's Delta distances... \n")
  }
if(distance.measure == "ES") {
  cat("Calculating Eder's Simple distances... \n")
  }
if(distance.measure == "MH") {
  cat("Calculating Mahattan distances... \n")
  }
if(distance.measure == "CB") {
  cat("Calculating Canberra distances... \n")
  }
if(distance.measure == "EU") {
  cat("Calculating Euclidean distances... \n")
  }



for(i in (mfw.min/mfw.incr):(mfw.max/mfw.incr)) {
mfw = i * mfw.incr

# for safety reasons, if MFWs > words in samples
if(mfw > length(list.of.words.after.culling) ) {
  mfw = length(list.of.words.after.culling)
  }

# the general counter for different purposes
number.of.current.iteration = number.of.current.iteration + 1



# the current task (number of MFW currently analyzed) echoed on the screen
cat(mfw, " ")









# #################################################
# module for Delta
# #################################################

if(tolower(classification.method) == "delta") {

# calculating classic Delta distances
if(distance.measure == "CD") {
  distance.name.on.graph = "Classic Delta distance"
  distance.name.on.file = "Classic Delta"
  distance.table = 
            as.matrix(dist(zscores.table.both.sets[,1:mfw],
            method="manhattan")) / mfw
  }
# calculating Argamon's "Linear Delta"
if(distance.measure == "AL") {
  distance.name.on.graph = "Argamon's Delta distance"
  distance.name.on.file = "Argamon's Delta"
  distance.table = 
            as.matrix(dist(zscores.table.both.sets[,1:mfw],
            method="euclidean")) / mfw
  }
# calculating Delta distances with Eder's modifications
if(distance.measure == "ED") {
  distance.name.on.graph = "Eder's Delta distance"
  distance.name.on.file = "Eder's Delta"
  zscores.plus.e.value = t(t(zscores.table.both.sets[,1:mfw])*((1+mfw:1)/mfw))
  distance.table = as.matrix(dist(zscores.plus.e.value,method="manhattan"))
  }
# calculating Eder's Simple distance to a matrix distance.table
if(distance.measure == "ES") {
  distance.table = 
         as.matrix(dist(sqrt(freq.table.both.sets[,1:mfw]),method="manhattan"))
  distance.name.on.graph = "Eder's Simple distance"
  distance.name.on.file = "Eder's Simple"
  }
# calculating Manhattan distance to a matrix distance.table
if(distance.measure == "MH") {
  distance.name.on.graph = "Manhattan distance"
  distance.name.on.file = "Manhattan"
  distance.table = 
           as.matrix(dist(freq.table.both.sets[,1:mfw],method="manhattan"))
  }
# calculating Canberra distance to a matrix distance.table
if(distance.measure == "CB") {
  distance.name.on.graph = "Canberra distance"
  distance.name.on.file = "Canberra"
  distance.table = 
           as.matrix(dist(freq.table.both.sets[,1:mfw],method="canberra"))
  }
# calculating Euclidean distance to a matrix distance.table
if(distance.measure == "EU") {
  distance.name.on.graph = "Euclidean distance"
  distance.name.on.file = "Euclidean"
  distance.table = 
           as.matrix(dist(freq.table.both.sets[,1:mfw],method="euclid"))
  }
# replaces the names of the samples (the extension ".txt" is cut off)
rownames(distance.table)=gsub("\\.txt$","",rownames(zscores.table.both.sets))
colnames(distance.table)=gsub("\\.txt$","",rownames(zscores.table.both.sets))

# #################################################
# extracting candidates, drawing, printing, etc.

# a selected area of the distance.table is needed, with colnames()
no.of.possib = length(primary.set[,1])
no.of.candid = length(secondary.set[,1])
selected.dist = 
          as.matrix(distance.table[no.of.possib+1:no.of.candid,1:no.of.possib])

}  # <--- delta


# tu kod dla SVM, kNN, NSC, NaiveBayes
#
#
#
#
#
#
distance.name.on.graph = "to be deleted"





if(tolower(classification.method) == "knn") {
  #kNN classification:
  library(class)
  #
  # training_set and test_set preparation; adding class labels to both sets
  classes.training = gsub("_.*","",rownames(zscores.primary.set))
  training.set = cbind(classes.training,zscores.primary.set[,1:mfw])
  classes.test = gsub("_.*","",rownames(zscores.secondary.set))
  test.set = cbind(classes.test,zscores.secondary.set[,1:mfw])
#  input.data = as.data.frame(rbind(training.set,test.set))
  #
  # number of nearest neighbors to be considered
  k.value = 1
  # classes that will be used for training the classifier (=classes of I set)
  classes = factor(training.set[,1])
  # training and classification
  classification.results = knn(training.set[,-1],test.set[,-1],classes,k=k.value)
  # cross-validation: 
  #knn.cv(training.set[,-1],classes,k=k.value,prob=T)
  classification.results = as.character(classification.results)
}


if(tolower(classification.method) == "naivebayes") {
  # Naive Bayes classification:
  library(e1071)
  #
  # training_set and test_set preparation; adding class labels to both sets
  training.set = primary.set[,1:mfw]
  test.set = secondary.set[,1:mfw]
  classes.training = gsub("_.*","",rownames(training.set))
  classes.test = gsub("_.*","",rownames(test.set))
  classes = c(classes.training, classes.test)
  input.data = as.data.frame(rbind(training.set,test.set))
  input.data = cbind(classes, input.data)
  training.classes = c(1:length(training.set[,1]))
  #
  # training a model
  model = naiveBayes(classes ~ ., data = input.data, subset = training.classes)
  #
  # testing the model on "new" data (i.e. the test.set)
  classification.results = predict(model, input.data[,-1])
  classification.results = as.character(classification.results)
  classification.results = classification.results[-c(1:length(classes.training))]
}



if(tolower(classification.method) == "svm") {
  # Support Vector Machines classification:
  library(e1071)
  #
  # training_set and test_set preparation; adding class labels to both sets
  training.set = primary.set[,1:mfw]
  test.set = secondary.set[,1:mfw]
  classes.training = gsub("_.*","",rownames(training.set))
  classes.test = gsub("_.*","",rownames(test.set))
  classes = c(classes.training, classes.test)
  input.data = as.data.frame(rbind(training.set,test.set))
  input.data = cbind(classes, input.data)
  training.classes = c(1:length(training.set[,1]))
  #
  # training a model
  model = svm(classes ~ ., data = input.data, subset = training.classes)
  #
  # testing the model on "new" data (i.e. the test.set)
  classification.results = predict(model, input.data[,-1])
  classification.results = as.character(classification.results)
  classification.results = classification.results[-c(1:length(classes.training))]
#plot(cmdscale(dist(input.data[,-1])),col=as.integer(input.data[,1]),pch=c("o","+"))
}



if(tolower(classification.method) == "nsc") {
  # Nearest Shrunken Centroid classification:
  library(pamr)
  #
  # training_set and test_set preparation; adding class labels to both sets
  training.set = primary.set[,1:mfw]
  test.set = secondary.set[,1:mfw]
  classes.training = gsub("_.*","",rownames(training.set))
  classes.test = gsub("_.*","",rownames(test.set))
  classes = c(classes.training, classes.test)
  input.data = as.data.frame(rbind(training.set,test.set))
  training.classes = c(1:length(training.set[,1]))
  mydata=list(x=t(input.data),y=as.factor(classes))
  # training a model
  model = pamr.train(mydata,sample.subset=c(1:length(classes.training)))
  # testing the model on "new" data (i.e. the test.set)
  classification.results = pamr.predict(model,mydata$x,threshold=1)
  classification.results = as.character(classification.results)
  classification.results = classification.results[-c(1:length(classes.training))]
}












# returns the ranking of the most likely candidates as a list
if(final.ranking.of.candidates == TRUE) {
    cat("\n\n\n",file=outputfile,append=T)
    if(tolower(classification.method) == "delta") {
      make.ranking.of.candidates(selected.dist,number.of.candidates)
    } else {
      misclassified.samples = 
                   paste(rownames(test.set), "\t-->\t",
                   classification.results)[classes.test!=classification.results]
      cat(misclassified.samples,file=outputfile,append=T,sep="\n")    
    }
}


# returns the number of correct attributions
if(how.many.correct.attributions == TRUE) {
    if(tolower(classification.method) == "delta") {
      no.of.correct.attrib = make.number.of.correct.attributions(selected.dist)
    } else {
      no.of.correct.attrib = sum(as.numeric(classes.test == 
                                 classification.results))
    }
    total.no.of.correct.attrib = 
         c(total.no.of.correct.attrib, no.of.correct.attrib)
    total.no.of.possible.attrib = 
         c(total.no.of.possible.attrib, perfect.guessing)
    cat("\n",file=outputfile,append=T)
    cat(mfw, " MFW , culled @ ",current.culling,"%,  ",
         no.of.correct.attrib," of ", perfect.guessing,"\t(",
         round(no.of.correct.attrib / perfect.guessing * 100, 1),"%)",
         "\n",file=outputfile,append=T,sep="")
}













}    # <-- the internal loop for(i) returns here
# #################################################

# blank line on the screen
cat("\n")


}    # <-- the main loop for(j) returns here
# #################################################



all.guesses = total.no.of.correct.attrib / total.no.of.possible.attrib * 100
total.no.of.correct.attrib = sum(total.no.of.correct.attrib)
total.no.of.possible.attrib = sum(total.no.of.possible.attrib)



# information about the current task into the logfile
cat("\nGeneral attributive success:  ", total.no.of.correct.attrib, " of ",
           total.no.of.possible.attrib, " (",
           round(total.no.of.correct.attrib/total.no.of.possible.attrib*100, 1), 
           "%)\n",file=outputfile,append=T,sep="")
cat("\nMFWs from ",mfw.min," to ",mfw.max.original,
                  " @ increment ",mfw.incr,"\nCulling from ",culling.min,
                  " to ",culling.max," @ increment ",culling.incr,
                  "\nPronouns deleted: ",delete.pronouns,"; ",
                  distance.name.on.graph,"\n",file=outputfile,append=T,sep="")
# additional empty line in outputfile (EOF)
cat("\n",file=outputfile,append=T)


# the same information (about the current task) on screen
cat("\nGeneral attributive success:  ", total.no.of.correct.attrib, " of ",
           total.no.of.possible.attrib, " (",
           round(total.no.of.correct.attrib/total.no.of.possible.attrib*100,1), 
           "%, sd =", round(sd(all.guesses),1),"%)\n")
cat("\nMFWs from ",mfw.min," to ",mfw.max.original,
                  " @ increment ",mfw.incr,"\nCulling from ",culling.min,
                  " to ",culling.max," @ increment ",culling.incr,
                  "\nPronouns deleted: ",delete.pronouns,"; ",
                  distance.name.on.graph,"\n",sep="")





# #################################################
# final cleaning



cat("\n")
cat("removing most of the variables... \n")
cat("type ls() if you want to see what was not removed\n")
cat("if you are going to change the corpus, clean all: rm(list=ls())\n")
cat("\n")
cat("Results saved in", outputfile, "\n")
cat("\n") 



# a list of variables not to be removed
do.not.remove = c("zscores.table.both.sets", "freq.table.both.sets",
                  "freq.I.set.0.culling", "freq.II.set.0.culling",
                  "distance.table","outputfile","all.guesses")

# removing the variables which are not on the above list
list.of.variables = ls()
#rm(list=list.of.variables[!(list.of.variables %in% do.not.remove)])




# #################################################