This document is part of the “CLG Authorship Experiments” repository. The .Rmd source document can be configured by modifying the following lines:
analyze.results <- TRUE
packages.path <- 'packages'
Sys.setenv(CLG_AUTHORSHIP_PACKAGES_PATH = packages.path)
work.dir <- 'experiments/1-doc-size'
variable.name <- 'Size of documents'
Sys.setenv(EXPE_WORK_DIR = work.dir)
doc.sizes <- '20 40 60 80 100 120 140 160 180 200 300 400 500 600 700 800 900 1000'
Sys.setenv(EXPE1_DOC_SIZES = doc.sizes)
# any single size is enough here: it is only used below to read the list of documents
any.doc.size <- strsplit(doc.sizes, ' ', fixed=TRUE)[[1]][1]
# fix the random seed for reproducibility
set.seed(2022)
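After changing these values, the document can simply be rendered again; a minimal sketch, assuming the source file is named 1-doc-size.Rmd (hypothetical name) and the rmarkdown package is installed:

Rscript -e 'rmarkdown::render("1-doc-size.Rmd")'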
When reproducing the experiments below manually, the environment variables should first be initialized, for instance:
export CLG_AUTHORSHIP_PACKAGES_PATH=packages
export EXPE_WORK_DIR=experiments/1-doc-size
export EXPE1_DOC_SIZES='20 40 60 80 100 120 140 160 180 200 300 400 500 600 700 800 900 1000'
source session-setup.sh
[ -d "$EXPE_WORK_DIR" ] || mkdir "$EXPE_WORK_DIR"
for SIZE in $EXPE1_DOC_SIZES; do
if [ ! -d "$EXPE_WORK_DIR/$SIZE" ]; then
echo "Creating data in dir $EXPE_WORK_DIR/$SIZE."
mkdir "$EXPE_WORK_DIR/$SIZE"
mkdir "$EXPE_WORK_DIR/$SIZE/data"
find data/gb/* data/ia/* -maxdepth 0 -type f | grep -v '\.' | ./create-snippets-dataset.sh -r $SIZE "$EXPE_WORK_DIR/$SIZE/data"
else
echo "dir $EXPE_WORK_DIR/$SIZE already created."
fi
done
## dir experiments/1-doc-size/20 already created.
## dir experiments/1-doc-size/40 already created.
## dir experiments/1-doc-size/60 already created.
## dir experiments/1-doc-size/80 already created.
## dir experiments/1-doc-size/100 already created.
## dir experiments/1-doc-size/120 already created.
## dir experiments/1-doc-size/140 already created.
## dir experiments/1-doc-size/160 already created.
## dir experiments/1-doc-size/180 already created.
## dir experiments/1-doc-size/200 already created.
## dir experiments/1-doc-size/300 already created.
## dir experiments/1-doc-size/400 already created.
## dir experiments/1-doc-size/500 already created.
## dir experiments/1-doc-size/600 already created.
## dir experiments/1-doc-size/700 already created.
## dir experiments/1-doc-size/800 already created.
## dir experiments/1-doc-size/900 already created.
## dir experiments/1-doc-size/1000 already created.
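Optionally, the generated datasets can be checked quickly, for instance by counting the snippet files produced for each size (a minimal sketch, assuming the snippets are stored as individual files in each data directory, as used below):

# count the snippet files available for every size
for SIZE in $EXPE1_DOC_SIZES; do
    echo "$SIZE: $(find "$EXPE_WORK_DIR/$SIZE/data" -type f | wc -l) files"
done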
d <- readDataDir(paste(work.dir,any.doc.size,'data',sep='/'))
dataSplitByAuthor <- splitAuthors(d)
## [1] "24 authors ( 22 with at least 2 books )"
## [1] "authors in train-only: mh,wdh,sw,ga,amd,hm,tsa"
## [1] "authors in test-only: fmc,haj,hj,espw,hbs,us,ab,ewaoc,mh+"
## [1] "authors in shared: ew,cfw,cdw,lma,es,mt,rwc,wta"
## [1] "books from shared authors in the training set: 90"
## [1] "all books in the training set: 285"
## [1] "books from shared authors in the test set: 89"
## [1] "books NOT from shared authors in the test set: 180"
## [1] "all books in the test set: 269"
full <- buildFullDataset(dataSplitByAuthor, 100, 100)
## [1] "*** TRAIN SET"
## [1] "Cartesian product: 81225"
## [1] "Removing pairs with same book: 80940"
## [1] "pairs same author: 7608"
## [1] "pairs diff author: 73332"
## [1] "picking without replacement (a book can appear only once):"
## [1] " *** (1) same author"
## [1] " *** (2) different author"
## [1] "shuffling rows"
## [1] "*** TEST SET"
## [1] "Cartesian product: 72361"
## [1] "Removing pairs with same book: 72092"
## [1] "pairs same author: 6354"
## [1] "pairs diff author: 65738"
## [1] "picking without replacement (a book can appear only once):"
## [1] " *** (1) same author"
## [1] " *** (2) different author"
## [1] "shuffling rows"
## [1] "15 authors in the training set"
## [1] "proportion of 'author seen in training' in the test set: 0.43"
fwrite(full, paste(work.dir,'full-dataset.tsv',sep='/'), sep='\t')
saveDatasetInCasesFormat(full,dir=work.dir)
source session-setup.sh
for SIZE in $EXPE1_DOC_SIZES; do
    echo "$SIZE: truth file"
    # the training cases file is used as the truth file for this size
    cat "$EXPE_WORK_DIR/train.tsv" > "$EXPE_WORK_DIR/$SIZE/data/truth.txt"
    echo "$SIZE: impostors"
    if [ ! -d "$EXPE_WORK_DIR/$SIZE/impostors" ]; then
        mkdir "$EXPE_WORK_DIR/$SIZE/impostors"
        mkdir "$EXPE_WORK_DIR/$SIZE/impostors/my.impostors"
        # copy every document referenced in the truth file into the impostors directory
        cut -f 1 "$EXPE_WORK_DIR/$SIZE/data/truth.txt" | tr ' :' '\n\n' | sort -u | while read f; do
            cp "$EXPE_WORK_DIR/$SIZE/data/$f" "$EXPE_WORK_DIR/$SIZE/impostors/my.impostors/$f.txt"
        done
    fi
    echo "$SIZE: multi-conf files"
    [ -d "$EXPE_WORK_DIR/$SIZE/process" ] || mkdir "$EXPE_WORK_DIR/$SIZE/process"
    generate-multi-conf.sh 1 "$EXPE_WORK_DIR/$SIZE/process"
    cat conf/meta-template.std.multi-conf > "$EXPE_WORK_DIR/$SIZE/process/meta-template.multi-conf"
done
## 20: truth file
## 20: impostors
## 20: multi-conf files
## generate-multi-conf.sh: Strategy 'basic' (with POS)
## generate-multi-conf.sh: Strategy 'GI' (with POS)
## generate-multi-conf.sh: Strategy 'univ' (with POS)
## 40: truth file
## 40: impostors
## 40: multi-conf files
## generate-multi-conf.sh: Strategy 'basic' (with POS)
## generate-multi-conf.sh: Strategy 'GI' (with POS)
## generate-multi-conf.sh: Strategy 'univ' (with POS)
## 60: truth file
## 60: impostors
## 60: multi-conf files
## generate-multi-conf.sh: Strategy 'basic' (with POS)
## generate-multi-conf.sh: Strategy 'GI' (with POS)
## generate-multi-conf.sh: Strategy 'univ' (with POS)
## 80: truth file
## 80: impostors
## 80: multi-conf files
## generate-multi-conf.sh: Strategy 'basic' (with POS)
## generate-multi-conf.sh: Strategy 'GI' (with POS)
## generate-multi-conf.sh: Strategy 'univ' (with POS)
## 100: truth file
## 100: impostors
## 100: multi-conf files
## generate-multi-conf.sh: Strategy 'basic' (with POS)
## generate-multi-conf.sh: Strategy 'GI' (with POS)
## generate-multi-conf.sh: Strategy 'univ' (with POS)
## 120: truth file
## 120: impostors
## 120: multi-conf files
## generate-multi-conf.sh: Strategy 'basic' (with POS)
## generate-multi-conf.sh: Strategy 'GI' (with POS)
## generate-multi-conf.sh: Strategy 'univ' (with POS)
## 140: truth file
## 140: impostors
## 140: multi-conf files
## generate-multi-conf.sh: Strategy 'basic' (with POS)
## generate-multi-conf.sh: Strategy 'GI' (with POS)
## generate-multi-conf.sh: Strategy 'univ' (with POS)
## 160: truth file
## 160: impostors
## 160: multi-conf files
## generate-multi-conf.sh: Strategy 'basic' (with POS)
## generate-multi-conf.sh: Strategy 'GI' (with POS)
## generate-multi-conf.sh: Strategy 'univ' (with POS)
## 180: truth file
## 180: impostors
## 180: multi-conf files
## generate-multi-conf.sh: Strategy 'basic' (with POS)
## generate-multi-conf.sh: Strategy 'GI' (with POS)
## generate-multi-conf.sh: Strategy 'univ' (with POS)
## 200: truth file
## 200: impostors
## 200: multi-conf files
## generate-multi-conf.sh: Strategy 'basic' (with POS)
## generate-multi-conf.sh: Strategy 'GI' (with POS)
## generate-multi-conf.sh: Strategy 'univ' (with POS)
## 300: truth file
## 300: impostors
## 300: multi-conf files
## generate-multi-conf.sh: Strategy 'basic' (with POS)
## generate-multi-conf.sh: Strategy 'GI' (with POS)
## generate-multi-conf.sh: Strategy 'univ' (with POS)
## 400: truth file
## 400: impostors
## 400: multi-conf files
## generate-multi-conf.sh: Strategy 'basic' (with POS)
## generate-multi-conf.sh: Strategy 'GI' (with POS)
## generate-multi-conf.sh: Strategy 'univ' (with POS)
## 500: truth file
## 500: impostors
## 500: multi-conf files
## generate-multi-conf.sh: Strategy 'basic' (with POS)
## generate-multi-conf.sh: Strategy 'GI' (with POS)
## generate-multi-conf.sh: Strategy 'univ' (with POS)
## 600: truth file
## 600: impostors
## 600: multi-conf files
## generate-multi-conf.sh: Strategy 'basic' (with POS)
## generate-multi-conf.sh: Strategy 'GI' (with POS)
## generate-multi-conf.sh: Strategy 'univ' (with POS)
## 700: truth file
## 700: impostors
## 700: multi-conf files
## generate-multi-conf.sh: Strategy 'basic' (with POS)
## generate-multi-conf.sh: Strategy 'GI' (with POS)
## generate-multi-conf.sh: Strategy 'univ' (with POS)
## 800: truth file
## 800: impostors
## 800: multi-conf files
## generate-multi-conf.sh: Strategy 'basic' (with POS)
## generate-multi-conf.sh: Strategy 'GI' (with POS)
## generate-multi-conf.sh: Strategy 'univ' (with POS)
## 900: truth file
## 900: impostors
## 900: multi-conf files
## generate-multi-conf.sh: Strategy 'basic' (with POS)
## generate-multi-conf.sh: Strategy 'GI' (with POS)
## generate-multi-conf.sh: Strategy 'univ' (with POS)
## 1000: truth file
## 1000: impostors
## 1000: multi-conf files
## generate-multi-conf.sh: Strategy 'basic' (with POS)
## generate-multi-conf.sh: Strategy 'GI' (with POS)
## generate-multi-conf.sh: Strategy 'univ' (with POS)
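Before launching the training, one may optionally verify that every size directory now contains its truth file, impostors and process configuration; a minimal sketch:

# report which size directories are fully prepared
for SIZE in $EXPE1_DOC_SIZES; do
    if [ -s "$EXPE_WORK_DIR/$SIZE/data/truth.txt" ] && [ -d "$EXPE_WORK_DIR/$SIZE/impostors/my.impostors" ] && [ -d "$EXPE_WORK_DIR/$SIZE/process" ]; then
        echo "$SIZE: ok"
    else
        echo "$SIZE: incomplete"
    fi
done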
The script ./run.sh performs the full training process for one single “size” (one value of the variable). It is a simple script that prepares the data and then starts the training process, as described in the user guide (part 2). It is used as follows:

./run.sh $EXPE_WORK_DIR $SIZE $TASKS_DIR $NCORES

where $SIZE is one of the document sizes defined in $EXPE1_DOC_SIZES.
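To reproduce the full experiment, run.sh has to be launched once for every size; a minimal sketch, assuming $TASKS_DIR and $NCORES have been set beforehand:

# train the models for every document size (simple wrapper loop, not part of the original scripts)
for SIZE in $EXPE1_DOC_SIZES; do
    ./run.sh "$EXPE_WORK_DIR" "$SIZE" "$TASKS_DIR" "$NCORES"
done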
The script ./evaluate-all.sh evaluates every trained model. It is used as follows:

./evaluate-all.sh $EXPE_WORK_DIR $NCORES $TASKS_DIR

The results are written to $EXPE_WORK_DIR/results, which contains the detailed output for every evaluated model, as well as the aggregated results file $EXPE_WORK_DIR/results/results.tsv, which is used for the analysis below.
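Once the evaluation has finished, the aggregated results file can be inspected directly before running the analysis below (a sketch; the columns of results.tsv are not detailed here):

# show the first few rows of the results table, aligned by column
head -n 5 "$EXPE_WORK_DIR/results/results.tsv" | column -t -s $'\t'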
d <- readExperimentResults(work.dir)
g1 <- perfByModelType(d,x.label=variable.name)
g1
## Warning: Removed 7 rows containing missing values (geom_point).
g2 <- comparePerfsByEvalOn(d,diff.seen=FALSE,x.label=variable.name)
g2
## `geom_smooth()` using formula 'y ~ x'
g3 <- comparePerfsByEvalOn(d,diff.seen=TRUE,x.label=variable.name)
g3
## `geom_smooth()` using formula 'y ~ x'
g<-plot_grid(g1,g2,g3,labels=NULL,ncol=3)
## Warning: Removed 7 rows containing missing values (geom_point).
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
ggsave('graphs-expe1.pdf', g, width=30, height=8, units='cm')
Below, the same graphs are split by size range for the sake of readability.
d<-readExperimentResults(work.dir)
d <- d[variable<=200,]
g1 <- perfByModelType(d,x.label=variable.name)
g1
## Warning: Removed 3 rows containing missing values (geom_point).
g2 <- comparePerfsByEvalOn(d,diff.seen=FALSE,x.label=variable.name)
g2
## `geom_smooth()` using formula 'y ~ x'
g3 <- comparePerfsByEvalOn(d,diff.seen=TRUE,x.label=variable.name)
g3
## `geom_smooth()` using formula 'y ~ x'
g<-plot_grid(g1,g2,g3,labels=NULL,ncol=3)
## Warning: Removed 3 rows containing missing values (geom_point).
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
ggsave('graphs-expe1A.pdf', g, width=30, height=8, units='cm')
d<-readExperimentResults(work.dir)
d <- d[variable>=200 | variable==100,]
g1 <- perfByModelType(d,x.label=variable.name)
g1
## Warning: Removed 4 rows containing missing values (geom_point).
g2 <- comparePerfsByEvalOn(d,diff.seen=FALSE,x.label=variable.name)
g2
## `geom_smooth()` using formula 'y ~ x'
g3 <- comparePerfsByEvalOn(d,diff.seen=TRUE,x.label=variable.name)
g3
## `geom_smooth()` using formula 'y ~ x'
g<-plot_grid(g1,g2,g3,labels=NULL,ncol=3)
## Warning: Removed 4 rows containing missing values (geom_point).
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
ggsave('graphs-expe1B.pdf', g, width=30, height=8, units='cm')