# Source code for pylangacq.objects

import dataclasses
from typing import Dict, List, Tuple, Union

from tabulate import tabulate

from ._punctuation_marks import _PUNCTUATION_MARKS


# Placeholder word forms carried by clitic tokens. A token whose word equals
# `_POSTCLITIC` is folded into the preceding token when an utterance is printed
# in tabular form.
_PRECLITIC = "PRECLITIC"
_POSTCLITIC = "POSTCLITIC"
# Both placeholders in one immutable set, for membership tests.
_CLITICS = frozenset({_PRECLITIC, _POSTCLITIC})


@dataclasses.dataclass
class Gra:
    """Grammatical relation of a word in an utterance.

    Attributes
    ----------
    dep : int
        The position of the dependent (i.e., the word itself) in the utterance
    head : int
        The position of the head in the utterance
    rel : str
        Grammatical relation
    """

    # Explicit slots avoid a per-instance ``__dict__``. This works together with
    # ``dataclass`` here because none of the fields has a class-level default.
    __slots__ = ("dep", "head", "rel")

    dep: int
    head: int
    rel: str


@dataclasses.dataclass
class Token:
    """Token with attributes as parsed from a CHAT utterance.

    Attributes
    ----------
    word : str
        Word form of the token
    pos : str or None
        Part-of-speech tag
    mor : str or None
        Morphological information
    gra : Gra or None
        Grammatical relation
    """

    __slots__ = ("word", "pos", "mor", "gra")

    word: str
    pos: Union[str, None]
    mor: Union[str, None]
    gra: Union[Gra, None]

    def to_mor_tier(self) -> str:
        """Return the %mor representation.

        A punctuation mark is returned as is. Any other token is rendered as
        ``pos|mor``, with a missing ``pos`` or ``mor`` shown as an empty string.

        Returns
        -------
        str
        """
        if self.word in _PUNCTUATION_MARKS:
            return self.word
        return f"{self.pos or ''}|{self.mor or ''}"

    def to_gra_tier(self) -> str:
        """Return the %gra representation.

        The result has the form ``dep|head|rel``. If the token carries no
        grammatical relation (``gra`` is None), an empty string is returned.

        Returns
        -------
        str
        """
        gra = self.gra
        if gra is None:
            # `gra` is optional by its annotation; don't fail on attribute access.
            return ""
        return f"{gra.dep}|{gra.head}|{gra.rel}"


@dataclasses.dataclass
class Utterance:
    """Utterance in CHAT transcript data.

    Attributes
    ----------
    participant : str
        Participant of the utterance, e.g., ``"CHI"``, ``"MOT"``
    tokens : List[Token]
        List of tokens of the utterance
    time_marks : Tuple[int, int]
        If available from the CHAT data, these are the start and end times
        (in milliseconds) for a segment in a digitized video or audio file,
        e.g., ``(0, 1073)``, extracted from ``"·0_1073·"`` in the CHAT data.
        ``"·"`` is ASCII code 21 (0x15), for NAK (Negative Acknowledgment).
    tiers : Dict[str, str]
        This dictionary contains all the original, unparsed data from the utterance,
        including the transcribed utterance (signaled by ``*CHI:``, ``*MOT:`` etc
        in CHAT), common tiers such as %mor and %gra, as well as all other tiers
        associated with the utterance. This dictionary is useful to retrieve
        whatever information not readily handled by this package.
    """

    __slots__ = ("participant", "tokens", "time_marks", "tiers")

    participant: str
    tokens: List[Token]
    time_marks: Union[Tuple[int, int], None]
    tiers: Dict[str, str]

    def _to_str(self, tabular: bool = True) -> str:
        """Return the utterance and its tiers as CHAT-style text.

        Parameters
        ----------
        tabular : bool, optional
            If True (the default) and the utterance has a %mor or %gra tier,
            the words are aligned in columns with their %mor / %gra items.
            Otherwise the raw tier strings are written, one tier per line.

        Returns
        -------
        str
        """
        # `mor_gra_keys` needs to be a list for the ordering.
        mor_gra_keys = [key for key in ("%mor", "%gra") if key in self.tiers]
        has_mor = "%mor" in mor_gra_keys
        has_gra = "%gra" in mor_gra_keys

        if tabular and mor_gra_keys:
            # One row per displayed word: [word, (%mor item), (%gra item)].
            rows = []
            # TODO: Write a test for the clitic case.
            for token in self.tokens:
                if token.word == _POSTCLITIC and rows:
                    # A postclitic has no column of its own: fold its items
                    # into the row of the preceding word. Merging into the
                    # existing row (instead of rebuilding it from the previous
                    # token) keeps the host word when clitics are chained.
                    host_row = rows[-1]
                    column = 1
                    if has_mor:
                        host_row[column] = (
                            f"{host_row[column]}~{token.to_mor_tier()}"
                        )
                        column += 1
                    if has_gra:
                        host_row[column] = (
                            f"{host_row[column]} {token.to_gra_tier()}"
                        )
                    continue

                row = [token.word]
                if has_mor:
                    row.append(token.to_mor_tier())
                if has_gra:
                    row.append(token.to_gra_tier())
                rows.append(row)

            header = [f"*{self.participant}:"] + [f"{key}:" for key in mor_gra_keys]
            # Transpose (see https://stackoverflow.com/a/6473724) so that each
            # tier becomes one line of the table.
            tiers_in_table = list(map(list, zip(header, *rows)))
            pieces = [f"{tabulate(tiers_in_table, tablefmt='plain')}\n"]
        else:
            pieces = [f"*{self.participant}:\t{self.tiers[self.participant]}\n"]
            pieces.extend(f"{key}:\t{self.tiers[key]}\n" for key in mor_gra_keys)

        # All remaining tiers follow as raw text, in their stored order.
        keys = _sort_keys(self.tiers.keys(), drop={self.participant, "%mor", "%gra"})
        pieces.extend(f"{key}:\t{self.tiers[key]}\n" for key in keys)

        return "".join(pieces)

    def _repr_html_(self):
        """Return an HTML table of the utterance (IPython rich-display hook).

        The first row holds the words, followed by one row each for %mor and
        %gra if those tiers exist, and one row per remaining tier spanning all
        token columns. All text is HTML-escaped, because CHAT data commonly
        contains ``<``, ``>`` and ``&``.

        Returns
        -------
        str
        """
        # Function-scope import so that this class needs no new module import.
        from html import escape

        def text(value) -> str:
            # Only element content is produced here (never attribute values),
            # so quotes do not need escaping.
            return escape(str(value), quote=False)

        def token_row(label, values) -> str:
            cells = "".join(
                f'    <td style="text-align: left">{text(value)}</td>\n'
                for value in values
            )
            return f"  <tr>\n    <td>{text(label)}:</td>\n{cells}  </tr>\n"

        # Row from words
        rows = [token_row(f"*{self.participant}", (t.word for t in self.tokens))]

        # Row from %mor
        if "%mor" in self.tiers:
            rows.append(token_row("%mor", (t.to_mor_tier() for t in self.tokens)))

        # Row from %gra
        if "%gra" in self.tiers:
            rows.append(token_row("%gra", (t.to_gra_tier() for t in self.tokens)))

        keys = _sort_keys(self.tiers.keys(), drop={self.participant, "%mor", "%gra"})
        n_tokens = len(self.tokens)
        for key in keys:
            rows.append(
                f"  <tr>\n"
                f"    <td>{text(key)}:</td>\n"
                f'    <td colspan="{n_tokens}" style="text-align: left">'
                f"{text(self.tiers[key])}</td>\n"
                f"  </tr>\n"
            )

        return f"<table>{''.join(rows)}</table>"


def _sort_keys(keys, *, first=None, drop=None) -> List[str]:
    """Return `keys` as a list, reordered and filtered.

    Parameters
    ----------
    keys : iterable of str
        The keys to arrange.
    first : iterable of str, optional
        Keys to put at the front, in the given order. Only those actually
        present in `keys` are used.
    drop : iterable of str, optional
        Keys to leave out, unless they were already placed via `first`.

    Returns
    -------
    List[str]
    """
    excluded = set(drop) if drop else set()  # ordering doesn't matter
    # Start with the requested leading keys that really exist in `keys`.
    ordered = [key for key in (first or []) if key in keys]
    # Then append everything else in its original order.
    for key in keys:
        if key not in ordered and key not in excluded:
            ordered.append(key)
    return ordered