Options

The .Rmd source document can be configured by modifying the following lines:

analyze.results <- TRUE
packages.path <- 'packages'
Sys.setenv(CLG_AUTHORSHIP_PACKAGES_PATH = packages.path)
work.dir <- 'experiments/4-author-diversity'
variable.name <- 'Number of authors in the training set'
Sys.setenv(EXPE_WORK_DIR = work.dir)
Sys.setenv(COPY_EXPE1_DATA_DIR = 'experiments/1-doc-size/100')
training.cases <- '2 3 4 5 6 7 8 9 10 11 12'
Sys.setenv(TRAINING_CASES = training.cases)
# first training case; used below to read one of the (identical) copied data dirs
any.training.cases <- strsplit(training.cases, ' ', fixed=TRUE)[[1]][1]
set.seed(2022)
The same values are exported as environment variables for the shell scripts:

export CLG_AUTHORSHIP_PACKAGES_PATH=packages
export EXPE_WORK_DIR=experiments/4-author-diversity
export COPY_EXPE1_DATA_DIR=experiments/1-doc-size/100
export TRAINING_CASES='2 3 4 5 6 7 8 9 10 11 12'
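
Before running any of the steps below, it is worth checking that this configuration is visible to the shell scripts. A minimal sketch, assuming session-setup.sh exports the variables shown above:

source session-setup.sh
# print each configured variable so a typo is caught early
for VAR in CLG_AUTHORSHIP_PACKAGES_PATH EXPE_WORK_DIR COPY_EXPE1_DATA_DIR TRAINING_CASES; do
  echo "$VAR=${!VAR}"
done
# the packages directory must exist before anything else is run
[ -d "$CLG_AUTHORSHIP_PACKAGES_PATH" ] || echo "packages dir not found" 1>&2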

Data generation

Dataset

We use the same dataset as in the first experiment; it must have been generated beforehand.

source session-setup.sh
if [ ! -d "$COPY_EXPE1_DATA_DIR" ]; then
  echo "Dir '$COPY_EXPE1_DATA_DIR' not found" 1>&2
  exit 1
fi
if [ ! -d "$EXPE_WORK_DIR" ]; then 
  mkdir "$EXPE_WORK_DIR"
  for SIZE in $TRAINING_CASES; do
     mkdir "$EXPE_WORK_DIR/$SIZE"
     cp -R "$COPY_EXPE1_DATA_DIR"/process "$COPY_EXPE1_DATA_DIR"/data "$COPY_EXPE1_DATA_DIR"/impostors "$EXPE_WORK_DIR/$SIZE"
     rm -f "$EXPE_WORK_DIR/$SIZE/data/truth.txt"
  done
fi
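
If the copy succeeded, every training-case directory now contains the process, data and impostors subdirectories; a quick check:

source session-setup.sh
for SIZE in $TRAINING_CASES; do
  for SUB in process data impostors; do
    # every per-size directory must contain the three copied subdirectories
    [ -d "$EXPE_WORK_DIR/$SIZE/$SUB" ] || echo "missing: $EXPE_WORK_DIR/$SIZE/$SUB" 1>&2
  done
done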

Training and test cases

d <- readDataDir(paste(work.dir,any.training.cases,'data',sep='/'))
dataSplitByAuthor <- splitAuthors(d)
## [1] "24  authors ( 22 with at least 2 books )"
## [1] "authors in train-only:  mh,wdh,sw,ga,amd,hm,tsa"
## [1] "authors in test-only:  fmc,haj,hj,espw,hbs,us,ab,ewaoc,mh+"
## [1] "authors in shared:  ew,cfw,cdw,lma,es,mt,rwc,wta"
## [1] "books from shared authors in the training set:  90"
## [1] "all books in the training set:  285"
## [1] "books from shared authors in the test set:  89"
## [1] "books NOT from shared authors in the test set:  180"
## [1] "all books in the test set:  269"
full <- buildFullDataset(dataSplitByAuthor, 100, 100)
## [1] "*** TRAIN SET"
## [1] "Cartesian product:  81225"
## [1] "Removing pairs with same book:  80940"
## [1] "pairs same author: 7608"
## [1] "pairs diff author: 73332"
## [1] "picking without replacement (a book can appear only once):"
## [1] " *** (1) same author"
## [1] " *** (2) different author"
## [1] "shuffling rows"
## [1] "*** TEST SET"
## [1] "Cartesian product:  72361"
## [1] "Removing pairs with same book:  72092"
## [1] "pairs same author: 6354"
## [1] "pairs diff author: 65738"
## [1] "picking without replacement (a book can appear only once):"
## [1] " *** (1) same author"
## [1] " *** (2) different author"
## [1] "shuffling rows"
## [1] "15 authors in the training set"
## [1] "proportion of 'author seen in training' in the test set: 0.43"
l <- restrictTrainingSetDiversity(full)
## [1] "#### Generating training set with 2 authors. Nb books: 62"
## [1] "2 authors in the training set"
## [1] "proportion of 'author seen in training' in the test set: 0"
## [1] "#### Generating training set with 3 authors. Nb books: 83"
## [1] "3 authors in the training set"
## [1] "proportion of 'author seen in training' in the test set: 0"
## [1] "#### Generating training set with 4 authors. Nb books: 104"
## [1] "4 authors in the training set"
## [1] "proportion of 'author seen in training' in the test set: 0.13"
## [1] "#### Generating training set with 5 authors. Nb books: 125"
## [1] "5 authors in the training set"
## [1] "proportion of 'author seen in training' in the test set: 0.13"
## [1] "#### Generating training set with 6 authors. Nb books: 140"
## [1] "6 authors in the training set"
## [1] "proportion of 'author seen in training' in the test set: 0.22"
## [1] "#### Generating training set with 7 authors. Nb books: 154"
## [1] "7 authors in the training set"
## [1] "proportion of 'author seen in training' in the test set: 0.22"
## [1] "#### Generating training set with 8 authors. Nb books: 164"
## [1] "8 authors in the training set"
## [1] "proportion of 'author seen in training' in the test set: 0.26"
## [1] "#### Generating training set with 9 authors. Nb books: 173"
## [1] "9 authors in the training set"
## [1] "proportion of 'author seen in training' in the test set: 0.26"
## [1] "#### Generating training set with 10 authors. Nb books: 182"
## [1] "10 authors in the training set"
## [1] "proportion of 'author seen in training' in the test set: 0.26"
## [1] "#### Generating training set with 11 authors. Nb books: 188"
## [1] "11 authors in the training set"
## [1] "proportion of 'author seen in training' in the test set: 0.3"
## [1] "#### Generating training set with 12 authors. Nb books: 193"
## [1] "12 authors in the training set"
## [1] "proportion of 'author seen in training' in the test set: 0.32"
## [1] "#### Generating training set with 13 authors. Nb books: 196"
## [1] "13 authors in the training set"
## [1] "proportion of 'author seen in training' in the test set: 0.37"
## [1] "#### Generating training set with 14 authors. Nb books: 199"
## [1] "13 authors in the training set"
## [1] "proportion of 'author seen in training' in the test set: 0.37"
## [1] "#### Generating training set with 15 authors. Nb books: 200"
## [1] "15 authors in the training set"
## [1] "proportion of 'author seen in training' in the test set: 0.43"
# Write each restricted training set to its own directory, in the format
# expected by the training process.
for (size.str in strsplit(training.cases, ' ', fixed=TRUE)[[1]]) {
  size <- as.numeric(size.str)
  d <- l[[size]]
  fwrite(d, paste(work.dir, size.str, 'full-dataset.tsv', sep='/'), sep='\t')
  saveDatasetInCasesFormat(d, dir=paste(work.dir, size.str, sep='/'))
}
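
At this point every training-case directory should contain a full-dataset.tsv file. A sanity check counting the pairs written for each size (fwrite writes a header line by default, hence the -1):

source session-setup.sh
for SIZE in $TRAINING_CASES; do
  F="$EXPE_WORK_DIR/$SIZE/full-dataset.tsv"
  if [ -f "$F" ]; then
    # number of pairs written for this training-set size
    echo "$SIZE: $(( $(wc -l < "$F") - 1 )) pairs"
  else
    echo "missing: $F" 1>&2
  fi
done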

Adding truth file

source session-setup.sh
for SIZE in $TRAINING_CASES; do
  echo "$SIZE: truth file"
  cat "$EXPE_WORK_DIR/$SIZE/train.tsv" > "$EXPE_WORK_DIR/$SIZE/data/truth.txt"
done
## 2: truth file
## 3: truth file
## 4: truth file
## 5: truth file
## 6: truth file
## 7: truth file
## 8: truth file
## 9: truth file
## 10: truth file
## 11: truth file
## 12: truth file
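
Since the truth file is a verbatim copy of train.tsv, both files should end up with the same number of lines; a minimal check:

source session-setup.sh
for SIZE in $TRAINING_CASES; do
  # the truth file was copied verbatim from train.tsv above
  A=$(wc -l < "$EXPE_WORK_DIR/$SIZE/train.tsv")
  B=$(wc -l < "$EXPE_WORK_DIR/$SIZE/data/truth.txt")
  [ "$A" -eq "$B" ] || echo "$SIZE: mismatch ($A vs $B)" 1>&2
done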

Running the training processes

The script ./run.sh performs the full training process for a single “size” (i.e. one value of the variable). It is a simple script that prepares the data and then starts the training process, as described in part 2 of the user guide. It is used as follows:

./run.sh $EXPE_WORK_DIR $SIZE $TASKS_DIR $NCORES
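
To process every value of the variable in sequence, the call can be wrapped in a loop. A sketch, where TASKS_DIR and NCORES are placeholder values to adapt to the local setup:

source session-setup.sh
TASKS_DIR=tasks   # hypothetical tasks directory, adapt to the local setup
NCORES=4          # number of cores available
for SIZE in $TRAINING_CASES; do
  ./run.sh "$EXPE_WORK_DIR" "$SIZE" "$TASKS_DIR" "$NCORES"
done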

Evaluating

The script ./evaluate-all.sh runs the evaluation for every trained model. It is used as follows:

./evaluate-all.sh $EXPE_WORK_DIR $NCORES $TASKS_DIR
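
Note that, unlike run.sh, the number of cores comes before the tasks directory. With the same placeholder values as above:

source session-setup.sh
./evaluate-all.sh "$EXPE_WORK_DIR" 4 tasks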

Analysis

d <- readExperimentResults(work.dir)
g1 <- perfByModelType(d,x.label=variable.name)
g1
## Warning: Removed 2 rows containing missing values (geom_point).

g2 <- comparePerfsByEvalOn(d,diff.seen=FALSE,x.label=variable.name)
g2
## `geom_smooth()` using formula 'y ~ x'

g3 <- comparePerfsByEvalOn(d,diff.seen=TRUE,x.label=variable.name)
g3
## `geom_smooth()` using formula 'y ~ x'
## Warning: Removed 2 rows containing non-finite values (stat_smooth).
## Warning: Removed 2 rows containing missing values (geom_point).
## Warning: Removed 2 row(s) containing missing values (geom_path).

g <- plot_grid(g1, g2, g3, labels=NULL, ncol=3)
## Warning: Removed 2 rows containing missing values (geom_point).
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## Warning: Removed 2 rows containing non-finite values (stat_smooth).

## Warning: Removed 2 rows containing missing values (geom_point).
## Warning: Removed 2 row(s) containing missing values (geom_path).
ggsave('graphs-expe4.pdf', g, width=30, height=8, units='cm')