Options

The .Rmd source document can be configured by modifying the following lines:

# Master switch: when TRUE the analysis sections at the end of the document
# are evaluated when knitting.
analyze.results <- TRUE
# Location of the CLG authorship packages; exported so that the shell chunks
# below can find them.
packages.path <- 'packages'
Sys.setenv(CLG_AUTHORSHIP_PACKAGES_PATH = packages.path)
# Working directory for this experiment; also exported for the shell chunks.
work.dir <- 'experiments/1-doc-size'
# Label used on the x axis of the graphs.
variable.name <- 'Size of documents'
Sys.setenv(EXPE_WORK_DIR = work.dir)
# Space-separated list of document sizes (in words — TODO confirm unit);
# one sub-experiment is run for each value.
doc.sizes <- '20 40 60 80 100 120 140 160 180 200 300 400 500 600 700 800 900 1000'
Sys.setenv(EXPE1_DOC_SIZES = doc.sizes)
# Any single size is enough to read the list of documents, since the same
# books are used for every size; the first one is picked arbitrarily.
any.doc.size <- strsplit(doc.sizes, ' ',fixed=TRUE)[[1]][1]
# Fixed seed for reproducibility of the random train/test splits.
set.seed(2022)

When reproducing the experiments below manually, first initialize the environment variables, for instance:

# Shell environment mirroring the R options above; the values must stay in
# sync with the R chunk when reproducing the experiments manually.
export CLG_AUTHORSHIP_PACKAGES_PATH=packages
export EXPE_WORK_DIR=experiments/1-doc-size
export EXPE1_DOC_SIZES='20 40 60 80 100 120 140 160 180 200 300 400 500 600 700 800 900 1000'

Data generation

Dataset by document size

# Generate one snippets dataset per document size listed in EXPE1_DOC_SIZES.
# Existing size directories are kept as-is, so the chunk is idempotent.
source session-setup.sh
[ -d "$EXPE_WORK_DIR" ] || mkdir "$EXPE_WORK_DIR"
for SIZE in $EXPE1_DOC_SIZES; do
  if [ ! -d "$EXPE_WORK_DIR/$SIZE" ]; then
    echo "Creating data in dir $EXPE_WORK_DIR/$SIZE."
    # -p creates both the size directory and its data/ subdirectory.
    mkdir -p "$EXPE_WORK_DIR/$SIZE/data"
    # Keep only regular files without a dot in their name (i.e. the book
    # files, not derived files) and cut them into snippets of SIZE units.
    # $SIZE is quoted to avoid word splitting/globbing.
    find data/gb/* data/ia/* -maxdepth 0 -type f | grep -v '\.' | ./create-snippets-dataset.sh -r "$SIZE" "$EXPE_WORK_DIR/$SIZE/data"
  else
    echo "dir $EXPE_WORK_DIR/$SIZE already created."
  fi
done
## dir experiments/1-doc-size/20 already created.
## dir experiments/1-doc-size/40 already created.
## dir experiments/1-doc-size/60 already created.
## dir experiments/1-doc-size/80 already created.
## dir experiments/1-doc-size/100 already created.
## dir experiments/1-doc-size/120 already created.
## dir experiments/1-doc-size/140 already created.
## dir experiments/1-doc-size/160 already created.
## dir experiments/1-doc-size/180 already created.
## dir experiments/1-doc-size/200 already created.
## dir experiments/1-doc-size/300 already created.
## dir experiments/1-doc-size/400 already created.
## dir experiments/1-doc-size/500 already created.
## dir experiments/1-doc-size/600 already created.
## dir experiments/1-doc-size/700 already created.
## dir experiments/1-doc-size/800 already created.
## dir experiments/1-doc-size/900 already created.
## dir experiments/1-doc-size/1000 already created.

Training and test cases

# Read the documents for one (arbitrary) size — the set of books is the same
# for every size — and split the authors into train/test/shared groups.
d <- readDataDir(file.path(work.dir, any.doc.size, 'data'))
dataSplitByAuthor <- splitAuthors(d)
## [1] "24  authors ( 22 with at least 2 books )"
## [1] "authors in train-only:  mh,wdh,sw,ga,amd,hm,tsa"
## [1] "authors in test-only:  fmc,haj,hj,espw,hbs,us,ab,ewaoc,mh+"
## [1] "authors in shared:  ew,cfw,cdw,lma,es,mt,rwc,wta"
## [1] "books from shared authors in the training set:  90"
## [1] "all books in the training set:  285"
## [1] "books from shared authors in the test set:  89"
## [1] "books NOT from shared authors in the test set:  180"
## [1] "all books in the test set:  269"
full <- buildFullDataset(dataSplitByAuthor, 100, 100)
## [1] "*** TRAIN SET"
## [1] "Cartesian product:  81225"
## [1] "Removing pairs with same book:  80940"
## [1] "pairs same author: 7608"
## [1] "pairs diff author: 73332"
## [1] "picking without replacement (a book can appear only once):"
## [1] " *** (1) same author"
## [1] " *** (2) different author"
## [1] "shuffling rows"
## [1] "*** TEST SET"
## [1] "Cartesian product:  72361"
## [1] "Removing pairs with same book:  72092"
## [1] "pairs same author: 6354"
## [1] "pairs diff author: 65738"
## [1] "picking without replacement (a book can appear only once):"
## [1] " *** (1) same author"
## [1] " *** (2) different author"
## [1] "shuffling rows"
## [1] "15 authors in the training set"
## [1] "proportion of 'author seen in training' in the test set: 0.43"
# Persist the pairs: one TSV with the whole dataset, plus the per-case
# files layout expected by the training scripts.
fwrite(full, file.path(work.dir, 'full-dataset.tsv'), sep = '\t')
saveDatasetInCasesFormat(full, dir = work.dir)

Adding truth file, impostors and meta-config files

  • Simply using all the training docs as impostors.
# For every document size: install the truth file, collect the impostor
# documents, and generate the multi-conf files. Idempotent for impostors.
source session-setup.sh
for SIZE in $EXPE1_DOC_SIZES; do
  echo "$SIZE: truth file"
  # The truth file is the same training list for every size.
  cp "$EXPE_WORK_DIR/train.tsv" "$EXPE_WORK_DIR/$SIZE/data/truth.txt"
  echo "$SIZE: impostors"
  if [ ! -d "$EXPE_WORK_DIR/$SIZE/impostors" ]; then
    mkdir -p "$EXPE_WORK_DIR/$SIZE/impostors/my.impostors"
    # Every document id in column 1 of the truth file (ids separated by
    # ' ' or ':') becomes an impostor document. -r prevents backslash
    # mangling by read; the cp destination is quoted against spaces.
    cut -f 1 "$EXPE_WORK_DIR/$SIZE/data/truth.txt" | tr ' :' '\n\n' | sort -u | while read -r f; do
      cp "$EXPE_WORK_DIR/$SIZE/data/$f" "$EXPE_WORK_DIR/$SIZE/impostors/my.impostors/$f.txt"
    done
  fi
  echo "$SIZE: multi-conf files"
  mkdir -p "$EXPE_WORK_DIR/$SIZE/process"
  generate-multi-conf.sh 1 "$EXPE_WORK_DIR/$SIZE/process"
  # Use the standard meta-template for every size.
  cp conf/meta-template.std.multi-conf "$EXPE_WORK_DIR/$SIZE/process/meta-template.multi-conf"
done
## 20: truth file
## 20: impostors
## 20: multi-conf files
## generate-multi-conf.sh: Strategy 'basic' (with POS)
## generate-multi-conf.sh: Strategy 'GI' (with POS)
## generate-multi-conf.sh: Strategy 'univ' (with POS)
## 40: truth file
## 40: impostors
## 40: multi-conf files
## generate-multi-conf.sh: Strategy 'basic' (with POS)
## generate-multi-conf.sh: Strategy 'GI' (with POS)
## generate-multi-conf.sh: Strategy 'univ' (with POS)
## 60: truth file
## 60: impostors
## 60: multi-conf files
## generate-multi-conf.sh: Strategy 'basic' (with POS)
## generate-multi-conf.sh: Strategy 'GI' (with POS)
## generate-multi-conf.sh: Strategy 'univ' (with POS)
## 80: truth file
## 80: impostors
## 80: multi-conf files
## generate-multi-conf.sh: Strategy 'basic' (with POS)
## generate-multi-conf.sh: Strategy 'GI' (with POS)
## generate-multi-conf.sh: Strategy 'univ' (with POS)
## 100: truth file
## 100: impostors
## 100: multi-conf files
## generate-multi-conf.sh: Strategy 'basic' (with POS)
## generate-multi-conf.sh: Strategy 'GI' (with POS)
## generate-multi-conf.sh: Strategy 'univ' (with POS)
## 120: truth file
## 120: impostors
## 120: multi-conf files
## generate-multi-conf.sh: Strategy 'basic' (with POS)
## generate-multi-conf.sh: Strategy 'GI' (with POS)
## generate-multi-conf.sh: Strategy 'univ' (with POS)
## 140: truth file
## 140: impostors
## 140: multi-conf files
## generate-multi-conf.sh: Strategy 'basic' (with POS)
## generate-multi-conf.sh: Strategy 'GI' (with POS)
## generate-multi-conf.sh: Strategy 'univ' (with POS)
## 160: truth file
## 160: impostors
## 160: multi-conf files
## generate-multi-conf.sh: Strategy 'basic' (with POS)
## generate-multi-conf.sh: Strategy 'GI' (with POS)
## generate-multi-conf.sh: Strategy 'univ' (with POS)
## 180: truth file
## 180: impostors
## 180: multi-conf files
## generate-multi-conf.sh: Strategy 'basic' (with POS)
## generate-multi-conf.sh: Strategy 'GI' (with POS)
## generate-multi-conf.sh: Strategy 'univ' (with POS)
## 200: truth file
## 200: impostors
## 200: multi-conf files
## generate-multi-conf.sh: Strategy 'basic' (with POS)
## generate-multi-conf.sh: Strategy 'GI' (with POS)
## generate-multi-conf.sh: Strategy 'univ' (with POS)
## 300: truth file
## 300: impostors
## 300: multi-conf files
## generate-multi-conf.sh: Strategy 'basic' (with POS)
## generate-multi-conf.sh: Strategy 'GI' (with POS)
## generate-multi-conf.sh: Strategy 'univ' (with POS)
## 400: truth file
## 400: impostors
## 400: multi-conf files
## generate-multi-conf.sh: Strategy 'basic' (with POS)
## generate-multi-conf.sh: Strategy 'GI' (with POS)
## generate-multi-conf.sh: Strategy 'univ' (with POS)
## 500: truth file
## 500: impostors
## 500: multi-conf files
## generate-multi-conf.sh: Strategy 'basic' (with POS)
## generate-multi-conf.sh: Strategy 'GI' (with POS)
## generate-multi-conf.sh: Strategy 'univ' (with POS)
## 600: truth file
## 600: impostors
## 600: multi-conf files
## generate-multi-conf.sh: Strategy 'basic' (with POS)
## generate-multi-conf.sh: Strategy 'GI' (with POS)
## generate-multi-conf.sh: Strategy 'univ' (with POS)
## 700: truth file
## 700: impostors
## 700: multi-conf files
## generate-multi-conf.sh: Strategy 'basic' (with POS)
## generate-multi-conf.sh: Strategy 'GI' (with POS)
## generate-multi-conf.sh: Strategy 'univ' (with POS)
## 800: truth file
## 800: impostors
## 800: multi-conf files
## generate-multi-conf.sh: Strategy 'basic' (with POS)
## generate-multi-conf.sh: Strategy 'GI' (with POS)
## generate-multi-conf.sh: Strategy 'univ' (with POS)
## 900: truth file
## 900: impostors
## 900: multi-conf files
## generate-multi-conf.sh: Strategy 'basic' (with POS)
## generate-multi-conf.sh: Strategy 'GI' (with POS)
## generate-multi-conf.sh: Strategy 'univ' (with POS)
## 1000: truth file
## 1000: impostors
## 1000: multi-conf files
## generate-multi-conf.sh: Strategy 'basic' (with POS)
## generate-multi-conf.sh: Strategy 'GI' (with POS)
## generate-multi-conf.sh: Strategy 'univ' (with POS)

Running the training processes

The script ./run.sh performs the full training process for one single “size” (variable value). It’s a simple script which prepares the data and then starts the training process, as described in the user guide (part 2). It is used as follows:

./run.sh $EXPE_WORK_DIR $SIZE $TASKS_DIR $NCORES

Evaluating

The script ./evaluate-all.sh evaluates the trained models.

It is used as follows:

./evaluate-all.sh $EXPE_WORK_DIR $NCORES $TASKS_DIR

Analysis

# Load the evaluation results collected across all document sizes.
d<-readExperimentResults(work.dir)
# Performance by model type as a function of document size.
g1 <- perfByModelType(d,x.label=variable.name)
g1
## Warning: Removed 7 rows containing missing values (geom_point).

# Performance by evaluation subset; diff.seen=FALSE presumably pools
# seen and unseen authors together — TODO confirm against the function.
g2 <- comparePerfsByEvalOn(d,diff.seen=FALSE,x.label=variable.name)
g2
## `geom_smooth()` using formula 'y ~ x'

# Same comparison with diff.seen=TRUE (authors seen in training
# distinguished from unseen ones — TODO confirm).
g3 <- comparePerfsByEvalOn(d,diff.seen=TRUE,x.label=variable.name)
g3
## `geom_smooth()` using formula 'y ~ x'

# Arrange the three plots side by side and save them as one PDF.
g<-plot_grid(g1,g2,g3,labels=NULL,ncol=3)
## Warning: Removed 7 rows containing missing values (geom_point).
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
ggsave('graphs-expe1.pdf',g,width=30,height=8,unit='cm')

Below the same graphs are split by size for the sake of readability.

Size below 200 by steps of 20

# Same analysis restricted to the small sizes: <= 200, steps of 20.
d<-readExperimentResults(work.dir)
d <- d[variable<=200,]
# Performance by model type.
g1 <- perfByModelType(d,x.label=variable.name)
g1
## Warning: Removed 3 rows containing missing values (geom_point).

# Performance by evaluation subset, seen/unseen authors pooled.
g2 <- comparePerfsByEvalOn(d,diff.seen=FALSE,x.label=variable.name)
g2
## `geom_smooth()` using formula 'y ~ x'

# Performance by evaluation subset, seen/unseen authors distinguished.
g3 <- comparePerfsByEvalOn(d,diff.seen=TRUE,x.label=variable.name)
g3
## `geom_smooth()` using formula 'y ~ x'

# Combine the three plots and save as one PDF.
g<-plot_grid(g1,g2,g3,labels=NULL,ncol=3)
## Warning: Removed 3 rows containing missing values (geom_point).
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
ggsave('graphs-expe1A.pdf',g,width=30,height=8,unit='cm')

Size between 100 and 1000 by steps of 100

# Same analysis restricted to the large sizes: 100 and every size >= 200,
# i.e. steps of 100 from 100 to 1000.
d<-readExperimentResults(work.dir)
d <- d[variable>=200 | variable ==100,]
# Performance by model type.
g1 <- perfByModelType(d,x.label=variable.name)
g1
## Warning: Removed 4 rows containing missing values (geom_point).

# Performance by evaluation subset, seen/unseen authors pooled.
g2 <- comparePerfsByEvalOn(d,diff.seen=FALSE,x.label=variable.name)
g2
## `geom_smooth()` using formula 'y ~ x'

# Performance by evaluation subset, seen/unseen authors distinguished.
g3 <- comparePerfsByEvalOn(d,diff.seen=TRUE,x.label=variable.name)
g3
## `geom_smooth()` using formula 'y ~ x'

# Combine the three plots and save as one PDF.
g<-plot_grid(g1,g2,g3,labels=NULL,ncol=3)
## Warning: Removed 4 rows containing missing values (geom_point).
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
ggsave('graphs-expe1B.pdf',g,width=30,height=8,unit='cm')