File size: 1,430 Bytes
0e5da39
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
#!/bin/sh
#
# Prepares Universal Dependencies 2.12 treebanks in the current directory:
# downloads and unpacks the release archive, imports each treebank into a
# directory named after its code, and finally concatenates the training data
# of languages that have several treebanks.

ud_archive=ud-treebanks-v2.12.tgz
ud_url=https://lindat.mff.cuni.cz/repository/xmlui/bitstream/handle/11234/1-5150/$ud_archive

# Download to a fixed name (-O) so that a stale archive left by an earlier run
# is overwritten instead of wget silently saving to "$ud_archive.1" and the old
# file being extracted. Abort on failure: everything below needs the tree.
if ! wget -O "$ud_archive" "$ud_url"; then
  rm -f "$ud_archive"
  echo "Cannot download $ud_url" >&2
  exit 1
fi

if ! tar xf "$ud_archive"; then
  rm -f "$ud_archive"
  echo "Cannot extract $ud_archive" >&2
  exit 1
fi
rm "$ud_archive"
# Import every treebank that has training data from the extracted release
# tree $1 into ./<code>/, where <code> is the part of the training file name
# before the first "-" (the file is named <code>-ud-train.conllu).
#
# For each treebank:
#   * the four treebanks listed below as having no word forms are skipped;
#   * all regular files of the treebank are copied into ./<code>/; when the
#     release has no dev file, conllu_split.pl is run on the training data
#     (presumably to carve out a dev set -- TODO confirm against that script);
#   * a treebank whose training data has fewer than 1000 lines starting with
#     a digit is removed again and reported on stderr;
#   * "<code> <treebank directory name>" is printed and appended to
#     iso_names.txt;
#   * every .conllu file gets "# variant = <code>" inserted as its first line
#     (requires GNU sed); non-training files are converted by
#     conllu_to_text.pl into a sibling .txt, while the .txt next to a
#     training file is deleted.
import_treebanks() {
  ud_root=$1

  for tb in "$ud_root"/*/*-ud-train.conllu; do
    # An unmatched glob is passed through literally; do not process it.
    [ -f "$tb" ] || continue

    dir=$(dirname "$tb")
    long_name=$(basename "$dir")
    code=$(basename "$tb")
    code=${code%%-*}
    # An empty code would turn the paths below into "/" -- never go there.
    [ -n "$code" ] || continue

    case $long_name in
      UD_Arabic-NYUAD|UD_English-GUMReddit|UD_Japanese-BCCWJ|UD_Japanese-BCCWJLUW)
        echo "Skipping treebank $long_name without forms"
        continue
        ;;
    esac

    mkdir -p "$code"
    # This already brings along the train and dev files when both exist, so
    # only the "no dev file" case needs extra work.
    cp "$dir"/* "$code"
    if [ ! -f "$dir/$code-ud-dev.conllu" ]; then
      perl conllu_split.pl "$code" "$code" <"$dir/$code-ud-train.conllu"
    fi

    # cat first so that grep prints one plain number even if the glob matches
    # several files (grep -c FILE1 FILE2 would print "file:count" lines), and
    # 0 instead of nothing when no training file is present.
    train_words=$(cat "$code"/*-ud-train.conllu | grep -c '^[0-9]')
    if [ "${train_words:-0}" -lt 1000 ]; then
      echo "Skipping treebank $code with $train_words training words." >&2
      rm -r "$code/"
      continue
    fi
    echo "$code $long_name" | tee -a iso_names.txt

    for conllu in "$code"/*.conllu; do
      [ -f "$conllu" ] || continue
      sed -i "1i# variant = $code" "$conllu"
      txt=${conllu%.conllu}.txt
      case "$conllu" in
        # -f: the plain-text file may be absent; that is not an error.
        *train.conllu) rm -f "$txt";;
        *) perl conllu_to_text.pl --language="$code" <"$conllu" >"$txt";;
      esac
    done
  done
}

import_treebanks ud-treebanks-v2.12
# The extracted tree is only needed during the import.
if [ -d ud-treebanks-v2.12 ]; then
  rm -r ud-treebanks-v2.12
fi

# For every language code shared by at least two directories in the current
# directory, concatenate the training files (*train.conllu) of those
# directories into <code>_all/<code>_all-ud-train.conllu.
#
# The language code of a directory is its name up to the first "_" (or the
# whole name when it has none). Directories are selected by comparing that
# exact code rather than by the prefix glob "$code"*/, which would also pull
# e.g. fro_* into the "fr" merge. The output directory itself is never used as
# input, so the function can be re-run safely.
merge_treebanks_by_language() {
  codes=$(
    for d in */; do
      # An unmatched glob is passed through literally; ignore it.
      [ -d "$d" ] || continue
      printf '%s\n' "${d%%[_/]*}"
    done | sort | uniq -d
  )

  # Word splitting is intended here: one code per line of $codes.
  for code in $codes; do
    merged_dir=${code}_all
    mkdir -p "$merged_dir"
    for d in */; do
      [ "${d%%[_/]*}" = "$code" ] || continue
      if [ "$d" = "$merged_dir/" ]; then
        continue
      fi
      for f in "$d"*train.conllu; do
        if [ -f "$f" ]; then
          cat "$f"
        fi
      done
    done >"$merged_dir/${code}_all-ud-train.conllu"
  done
}

merge_treebanks_by_language