Options

The .Rmd source document can be configured by modifying the following lines:

# Master switch: when TRUE the analysis sections at the end of the document
# are evaluated when knitting.
analyze.results <- TRUE
# Location of the CLG authorship packages; exported so that the shell chunks
# below can find them.
packages.path <- 'packages'
Sys.setenv(CLG_AUTHORSHIP_PACKAGES_PATH = packages.path)
# Working directory for this experiment; also exported for the shell chunks.
work.dir <- 'experiments/1-doc-size'
# Label used on the x axis of the graphs.
variable.name <- 'Size of documents'
Sys.setenv(EXPE_WORK_DIR = work.dir)
# Space-separated list of document sizes (in words — TODO confirm unit);
# one sub-experiment is run for each value.
doc.sizes <- '20 40 60 80 100 120 140 160 180 200 300 400 500 600 700 800 900 1000'
Sys.setenv(EXPE1_DOC_SIZES = doc.sizes)
# Any single size is enough to read the list of documents, since the same
# books are used for every size; the first one is picked arbitrarily.
any.doc.size <- strsplit(doc.sizes, ' ',fixed=TRUE)[[1]][1]
# Fixed seed for reproducibility of the random train/test splits.
set.seed(2022)

When reproducing the experiments below manually, first initialize the environment variables, for instance:

# Shell environment mirroring the R options above; the values must stay in
# sync with the R chunk when reproducing the experiments manually.
export CLG_AUTHORSHIP_PACKAGES_PATH=packages
export EXPE_WORK_DIR=experiments/1-doc-size
export EXPE1_DOC_SIZES='20 40 60 80 100 120 140 160 180 200 300 400 500 600 700 800 900 1000'

Data generation

Dataset by document size

# Generate one snippets dataset per document size listed in EXPE1_DOC_SIZES.
# Existing size directories are kept as-is, so the chunk is idempotent.
source session-setup.sh
[ -d "$EXPE_WORK_DIR" ] || mkdir "$EXPE_WORK_DIR"
for SIZE in $EXPE1_DOC_SIZES; do
  if [ ! -d "$EXPE_WORK_DIR/$SIZE" ]; then
    echo "Creating data in dir $EXPE_WORK_DIR/$SIZE."
    # -p creates both the size directory and its data/ subdirectory.
    mkdir -p "$EXPE_WORK_DIR/$SIZE/data"
    # Keep only regular files without a dot in their name (i.e. the book
    # files, not derived files) and cut them into snippets of SIZE units.
    # $SIZE is quoted to avoid word splitting/globbing.
    find data/gb/* data/ia/* -maxdepth 0 -type f | grep -v '\.' | ./create-snippets-dataset.sh -r "$SIZE" "$EXPE_WORK_DIR/$SIZE/data"
  else
    echo "dir $EXPE_WORK_DIR/$SIZE already created."
  fi
done
## dir experiments/1-doc-size/20 already created.
## dir experiments/1-doc-size/40 already created.
## dir experiments/1-doc-size/60 already created.
## dir experiments/1-doc-size/80 already created.
## dir experiments/1-doc-size/100 already created.
## dir experiments/1-doc-size/120 already created.
## dir experiments/1-doc-size/140 already created.
## dir experiments/1-doc-size/160 already created.
## dir experiments/1-doc-size/180 already created.
## dir experiments/1-doc-size/200 already created.
## dir experiments/1-doc-size/300 already created.
## dir experiments/1-doc-size/400 already created.
## dir experiments/1-doc-size/500 already created.
## dir experiments/1-doc-size/600 already created.
## dir experiments/1-doc-size/700 already created.
## dir experiments/1-doc-size/800 already created.
## dir experiments/1-doc-size/900 already created.
## dir experiments/1-doc-size/1000 already created.

Training and test cases

# Read the documents for one (arbitrary) size — the set of books is the same
# for every size — and split the authors into train/test/shared groups.
d <- readDataDir(file.path(work.dir, any.doc.size, 'data'))
dataSplitByAuthor <- splitAuthors(d)
## [1] "24  authors ( 22 with at least 2 books )"
## [1] "authors in train-only:  mh,wdh,sw,ga,amd,hm,tsa"
## [1] "authors in test-only:  fmc,haj,hj,espw,hbs,us,ab,ewaoc,mh+"
## [1] "authors in shared:  ew,cfw,cdw,lma,es,mt,rwc,wta"
## [1] "books from shared authors in the training set:  90"
## [1] "all books in the training set:  285"
## [1] "books from shared authors in the test set:  89"
## [1] "books NOT from shared authors in the test set:  180"
## [1] "all books in the test set:  269"
full <- buildFullDataset(dataSplitByAuthor, 100, 100)
## [1] "*** TRAIN SET"
## [1] "Cartesian product:  81225"
## [1] "Removing pairs with same book:  80940"
## [1] "pairs same author: 7608"
## [1] "pairs diff author: 73332"
## [1] "picking without replacement (a book can appear only once):"
## [1] " *** (1) same author"
## [1] " *** (2) different author"
## [1] "shuffling rows"
## [1] "*** TEST SET"
## [1] "Cartesian product:  72361"
## [1] "Removing pairs with same book:  72092"
## [1] "pairs same author: 6354"
## [1] "pairs diff author: 65738"
## [1] "picking without replacement (a book can appear only once):"
## [1] " *** (1) same author"
## [1] " *** (2) different author"
## [1] "shuffling rows"
## [1] "15 authors in the training set"
## [1] "proportion of 'author seen in training' in the test set: 0.43"
# Persist the pairs: one TSV with the whole dataset, plus the per-case
# files layout expected by the training scripts.
fwrite(full, file.path(work.dir, 'full-dataset.tsv'), sep = '\t')
saveDatasetInCasesFormat(full, dir = work.dir)

Adding truth file, impostors and meta-config files

  • Simply using all the training docs as impostors.
# For every document size: install the truth file, collect the impostor
# documents, and generate the multi-conf files. Idempotent for impostors.
source session-setup.sh
for SIZE in $EXPE1_DOC_SIZES; do
  echo "$SIZE: truth file"
  # The truth file is the same training list for every size.
  cp "$EXPE_WORK_DIR/train.tsv" "$EXPE_WORK_DIR/$SIZE/data/truth.txt"
  echo "$SIZE: impostors"
  if [ ! -d "$EXPE_WORK_DIR/$SIZE/impostors" ]; then
    mkdir -p "$EXPE_WORK_DIR/$SIZE/impostors/my.impostors"
    # Every document id in column 1 of the truth file (ids separated by
    # ' ' or ':') becomes an impostor document. -r prevents backslash
    # mangling by read; the cp destination is quoted against spaces.
    cut -f 1 "$EXPE_WORK_DIR/$SIZE/data/truth.txt" | tr ' :' '\n\n' | sort -u | while read -r f; do
      cp "$EXPE_WORK_DIR/$SIZE/data/$f" "$EXPE_WORK_DIR/$SIZE/impostors/my.impostors/$f.txt"
    done
  fi
  echo "$SIZE: multi-conf files"
  mkdir -p "$EXPE_WORK_DIR/$SIZE/process"
  generate-multi-conf.sh 1 "$EXPE_WORK_DIR/$SIZE/process"
  # Use the standard meta-template for every size.
  cp conf/meta-template.std.multi-conf "$EXPE_WORK_DIR/$SIZE/process/meta-template.multi-conf"
done
## 20: truth file
## 20: impostors
## 20: multi-conf files
## generate-multi-conf.sh: Strategy 'basic' (with POS)
## generate-multi-conf.sh: Strategy 'GI' (with POS)
## generate-multi-conf.sh: Strategy 'univ' (with POS)
## 40: truth file
## 40: impostors
## 40: multi-conf files
## generate-multi-conf.sh: Strategy 'basic' (with POS)
## generate-multi-conf.sh: Strategy 'GI' (with POS)
## generate-multi-conf.sh: Strategy 'univ' (with POS)
## 60: truth file
## 60: impostors
## 60: multi-conf files
## generate-multi-conf.sh: Strategy 'basic' (with POS)
## generate-multi-conf.sh: Strategy 'GI' (with POS)
## generate-multi-conf.sh: Strategy 'univ' (with POS)
## 80: truth file
## 80: impostors
## 80: multi-conf files
## generate-multi-conf.sh: Strategy 'basic' (with POS)
## generate-multi-conf.sh: Strategy 'GI' (with POS)
## generate-multi-conf.sh: Strategy 'univ' (with POS)
## 100: truth file
## 100: impostors
## 100: multi-conf files
## generate-multi-conf.sh: Strategy 'basic' (with POS)
## generate-multi-conf.sh: Strategy 'GI' (with POS)
## generate-multi-conf.sh: Strategy 'univ' (with POS)
## 120: truth file
## 120: impostors
## 120: multi-conf files
## generate-multi-conf.sh: Strategy 'basic' (with POS)
## generate-multi-conf.sh: Strategy 'GI' (with POS)
## generate-multi-conf.sh: Strategy 'univ' (with POS)
## 140: truth file
## 140: impostors
## 140: multi-conf files
## generate-multi-conf.sh: Strategy 'basic' (with POS)
## generate-multi-conf.sh: Strategy 'GI' (with POS)
## generate-multi-conf.sh: Strategy 'univ' (with POS)
## 160: truth file
## 160: impostors
## 160: multi-conf files
## generate-multi-conf.sh: Strategy 'basic' (with POS)
## generate-multi-conf.sh: Strategy 'GI' (with POS)
## generate-multi-conf.sh: Strategy 'univ' (with POS)
## 180: truth file
## 180: impostors
## 180: multi-conf files
## generate-multi-conf.sh: Strategy 'basic' (with POS)
## generate-multi-conf.sh: Strategy 'GI' (with POS)
## generate-multi-conf.sh: Strategy 'univ' (with POS)
## 200: truth file
## 200: impostors
## 200: multi-conf files
## generate-multi-conf.sh: Strategy 'basic' (with POS)
## generate-multi-conf.sh: Strategy 'GI' (with POS)
## generate-multi-conf.sh: Strategy 'univ' (with POS)
## 300: truth file
## 300: impostors
## 300: multi-conf files
## generate-multi-conf.sh: Strategy 'basic' (with POS)
## generate-multi-conf.sh: Strategy 'GI' (with POS)
## generate-multi-conf.sh: Strategy 'univ' (with POS)
## 400: truth file
## 400: impostors
## 400: multi-conf files
## generate-multi-conf.sh: Strategy 'basic' (with POS)
## generate-multi-conf.sh: Strategy 'GI' (with POS)
## generate-multi-conf.sh: Strategy 'univ' (with POS)
## 500: truth file
## 500: impostors
## 500: multi-conf files
## generate-multi-conf.sh: Strategy 'basic' (with POS)
## generate-multi-conf.sh: Strategy 'GI' (with POS)
## generate-multi-conf.sh: Strategy 'univ' (with POS)
## 600: truth file
## 600: impostors
## 600: multi-conf files
## generate-multi-conf.sh: Strategy 'basic' (with POS)
## generate-multi-conf.sh: Strategy 'GI' (with POS)
## generate-multi-conf.sh: Strategy 'univ' (with POS)
## 700: truth file
## 700: impostors
## 700: multi-conf files
## generate-multi-conf.sh: Strategy 'basic' (with POS)
## generate-multi-conf.sh: Strategy 'GI' (with POS)
## generate-multi-conf.sh: Strategy 'univ' (with POS)
## 800: truth file
## 800: impostors
## 800: multi-conf files
## generate-multi-conf.sh: Strategy 'basic' (with POS)
## generate-multi-conf.sh: Strategy 'GI' (with POS)
## generate-multi-conf.sh: Strategy 'univ' (with POS)
## 900: truth file
## 900: impostors
## 900: multi-conf files
## generate-multi-conf.sh: Strategy 'basic' (with POS)
## generate-multi-conf.sh: Strategy 'GI' (with POS)
## generate-multi-conf.sh: Strategy 'univ' (with POS)
## 1000: truth file
## 1000: impostors
## 1000: multi-conf files
## generate-multi-conf.sh: Strategy 'basic' (with POS)
## generate-multi-conf.sh: Strategy 'GI' (with POS)
## generate-multi-conf.sh: Strategy 'univ' (with POS)

Running the training processes

The script ./run.sh performs the full training process for one single “size” (variable value). It’s a simple script which prepares the data and then starts the training process, as described in the user guide (part 2). It is used as follows:

./run.sh $EXPE_WORK_DIR $SIZE $TASKS_DIR $NCORES

Evaluating

The script ./evaluate-all.sh evaluates the trained models.

It is used as follows:

./evaluate-all.sh $EXPE_WORK_DIR $NCORES $TASKS_DIR

Analysis

# Load the evaluation results collected across all document sizes.
d<-readExperimentResults(work.dir)
# Performance by model type as a function of document size.
g1 <- perfByModelType(d,x.label=variable.name)
g1
## Warning: Removed 7 rows containing missing values (geom_point).

# Performance by evaluation subset; diff.seen=FALSE presumably pools
# seen and unseen authors together — TODO confirm against the function.
g2 <- comparePerfsByEvalOn(d,diff.seen=FALSE,x.label=variable.name)
g2
## `geom_smooth()` using formula 'y ~ x'

# Same comparison with diff.seen=TRUE (authors seen in training
# distinguished from unseen ones — TODO confirm).
g3 <- comparePerfsByEvalOn(d,diff.seen=TRUE,x.label=variable.name)
g3
## `geom_smooth()` using formula 'y ~ x'

# Arrange the three plots side by side and save them as one PDF.
g<-plot_grid(g1,g2,g3,labels=NULL,ncol=3)
## Warning: Removed 7 rows containing missing values (geom_point).
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
ggsave('graphs-expe1.pdf',g,width=30,height=8,unit='cm')

Below the same graphs are split by size for the sake of readability.

Size below 200 by steps of 20

# Same analysis restricted to the small sizes: <= 200, steps of 20.
d<-readExperimentResults(work.dir)
d <- d[variable<=200,]
# Performance by model type.
g1 <- perfByModelType(d,x.label=variable.name)
g1
## Warning: Removed 3 rows containing missing values (geom_point).

# Performance by evaluation subset, seen/unseen authors pooled.
g2 <- comparePerfsByEvalOn(d,diff.seen=FALSE,x.label=variable.name)
g2
## `geom_smooth()` using formula 'y ~ x'

# Performance by evaluation subset, seen/unseen authors distinguished.
g3 <- comparePerfsByEvalOn(d,diff.seen=TRUE,x.label=variable.name)
g3
## `geom_smooth()` using formula 'y ~ x'

# Combine the three plots and save as one PDF.
g<-plot_grid(g1,g2,g3,labels=NULL,ncol=3)
## Warning: Removed 3 rows containing missing values (geom_point).
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
ggsave('graphs-expe1A.pdf',g,width=30,height=8,unit='cm')

Size between 100 and 1000 by steps of 100

# Same analysis restricted to the large sizes: 100 and every size >= 200,
# i.e. steps of 100 from 100 to 1000.
d<-readExperimentResults(work.dir)
d <- d[variable>=200 | variable ==100,]
# Performance by model type.
g1 <- perfByModelType(d,x.label=variable.name)
g1
## Warning: Removed 4 rows containing missing values (geom_point).

# Performance by evaluation subset, seen/unseen authors pooled.
g2 <- comparePerfsByEvalOn(d,diff.seen=FALSE,x.label=variable.name)
g2
## `geom_smooth()` using formula 'y ~ x'

# Performance by evaluation subset, seen/unseen authors distinguished.
g3 <- comparePerfsByEvalOn(d,diff.seen=TRUE,x.label=variable.name)
g3
## `geom_smooth()` using formula 'y ~ x'

# Combine the three plots and save as one PDF.
g<-plot_grid(g1,g2,g3,labels=NULL,ncol=3)
## Warning: Removed 4 rows containing missing values (geom_point).
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
ggsave('graphs-expe1B.pdf',g,width=30,height=8,unit='cm')