File size: 1,527 Bytes
2890e34
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
import logging
from collections import defaultdict
from typing import *
import os

from allennlp.data.dataset_readers.dataset_reader import DatasetReader
from allennlp.data.instance import Instance
from concrete import SituationMention
from concrete.util import CommunicationReader

from .span_reader import SpanReader
from .srl_reader import SRLDatasetReader
from .concrete_srl import collect_concrete_srl
from ..utils import Span, BIOSmoothing

logger = logging.getLogger(__name__)


@DatasetReader.register('concrete')
class ConcreteDatasetReader(SRLDatasetReader):
    """Dataset reader, registered as ``'concrete'``, for Concrete communications.

    Each communication yielded by ``CommunicationReader`` is passed to
    ``collect_concrete_srl``; every ``(tokens, vr)`` pair it produces is turned
    into an ``Instance`` through ``self.text_to_instance``.
    """

    def __init__(
            self,
            event_only: bool = False,
            event_smoothing_factor: float = 0.,
            arg_smoothing_factor: float = 0.,
            **extra
    ):
        """Store the reader options and forward everything else to the parent.

        Args:
            event_only: Stored as ``self.event_only``; it is not consulted
                inside this class.
                NOTE(review): presumably read by the parent class -- confirm.
            event_smoothing_factor: Stored as ``self.event_smooth_factor``.
            arg_smoothing_factor: Stored as ``self.arg_smooth_factor``.
            **extra: Passed unchanged to ``SRLDatasetReader.__init__``.
        """
        super().__init__(**extra)
        self.event_only = event_only
        # NOTE: the attribute names drop the "ing" of the parameter names
        # ("smooth" vs "smoothing"); kept as-is because code outside this
        # class may read them.
        self.event_smooth_factor = event_smoothing_factor
        self.arg_smooth_factor = arg_smoothing_factor

    def _read(self, file_path: str) -> Iterable[Instance]:
        """Yield instances from a Concrete file, or from every entry of a directory.

        Args:
            file_path: Path to a file readable by ``CommunicationReader``, or
                to a directory whose entries are read recursively.

        Yields:
            One ``Instance`` per ``(tokens, vr)`` pair returned by
            ``collect_concrete_srl`` for each communication.
        """
        if os.path.isdir(file_path):
            # os.listdir returns entries in arbitrary order; sort them so the
            # instance order is reproducible across runs and machines.
            for fn in sorted(os.listdir(file_path)):
                yield from self._read(os.path.join(file_path, fn))
            # A directory is not itself a communication file: stop here rather
            # than falling through and handing the directory path to
            # CommunicationReader.
            return
        all_files = CommunicationReader(file_path)
        # CommunicationReader yields (communication, filename) pairs; only the
        # communication is needed here.
        for comm, _ in all_files:
            sentences = collect_concrete_srl(comm)
            for tokens, vr in sentences:
                yield self.text_to_instance(tokens, vr)
        # Report and reset the removed-span counter once per file read.
        # NOTE(review): n_span_removed is not set in this class; presumably it
        # is maintained by the parent reader -- confirm.
        logger.warning(f'{self.n_span_removed} spans were removed')
        self.n_span_removed = 0