Source code for pylangacq.objects

import dataclasses
from typing import Dict, List, Tuple, Union

from tabulate import tabulate

from pylangacq._punctuation_marks import _PUNCTUATION_MARKS


# Sentinel word form. When an utterance is rendered as a table, a token whose
# `word` equals this value is not given its own column: its %mor and %gra
# representations are merged into the column of the token that precedes it.
_CLITIC = "CLITIC"


@dataclasses.dataclass
class Gra:
    """Grammatical relation linking a word to its head within an utterance.

    Attributes
    ----------
    dep : int
        Position in the utterance of the dependent, i.e., of the word that
        this relation belongs to
    head : int
        Position in the utterance of the head word
    rel : str
        Label of the grammatical relation
    """

    # Fixed attribute set: instances carry no per-instance ``__dict__``.
    __slots__ = ("dep", "head", "rel")

    dep: int
    head: int
    rel: str
@dataclasses.dataclass
class Token:
    """Token with attributes as parsed from a CHAT utterance.

    Attributes
    ----------
    word : str
        Word form of the token
    pos : str or None
        Part-of-speech tag
    mor : str or None
        Morphological information
    gra : Gra or None
        Grammatical relation
    """

    __slots__ = ("word", "pos", "mor", "gra")

    word: str
    pos: Union[str, None]
    mor: Union[str, None]
    gra: Union[Gra, None]

    def to_mor_tier(self) -> str:
        """Return the %mor representation.

        A punctuation mark is returned as is. Any other token is rendered as
        ``pos|mor``, with a missing part-of-speech tag or missing
        morphological information rendered as an empty string.

        Returns
        -------
        str
        """
        if self.word in _PUNCTUATION_MARKS:
            return self.word
        return f"{self.pos or ''}|{self.mor or ''}"

    def to_gra_tier(self) -> str:
        """Return the %gra representation.

        The result has the form ``dep|head|rel``. If the token has no
        grammatical relation (``gra`` is ``None``), an empty string is
        returned.

        Returns
        -------
        str
        """
        # `gra` is optional; without this guard a token lacking %gra data
        # would raise AttributeError on `None.dep`.
        if self.gra is None:
            return ""
        return f"{self.gra.dep}|{self.gra.head}|{self.gra.rel}"
@dataclasses.dataclass
class Utterance:
    """Utterance in a CHAT transcript data.

    Attributes
    ----------
    participant : str
        Participant of the utterance, e.g., ``"CHI"``, ``"MOT"``
    tokens : List[Token]
        List of tokens of the utterance
    time_marks : Tuple[int, int]
        If available from the CHAT data, these are the start and end times
        (in milliseconds) for a segment in a digitized video or audio file,
        e.g., ``(0, 1073)``, extracted from ``"·0_1073·"`` in the CHAT data.
        ``"·"`` is ASCII code 21 (0x15), for NAK (Negative Acknowledgment).
    tiers : Dict[str, str]
        This dictionary contains all the original, unparsed data from the
        utterance, including the transcribed utterance (signaled by ``*CHI:``,
        ``*MOT:`` etc in CHAT), common tiers such as %mor and %gra,
        as well as all other tiers associated with the utterance.
        This dictionary is useful to retrieve whatever information not readily
        handled by this package.
    """

    __slots__ = ("participant", "tokens", "time_marks", "tiers")

    participant: str
    tokens: List[Token]
    time_marks: Union[Tuple[int, int], None]
    tiers: Dict[str, str]

    def _to_str(self, tabular: bool = True) -> str:
        """Return the utterance as CHAT-style text.

        Parameters
        ----------
        tabular : bool, optional
            If True and the utterance has a %mor or %gra tier, the words and
            their %mor / %gra items are laid out as vertically aligned
            columns. Otherwise, the original tier strings are written out,
            one tab-separated ``key:<TAB>value`` line per tier.

        Returns
        -------
        str
            The participant line first, then %mor and %gra (when present),
            then all remaining tiers. Every line ends with a newline.
        """
        # `mor_gra_keys` needs to be a list for the ordering.
        mor_gra_keys = [key for key in ("%mor", "%gra") if key in self.tiers]
        has_mor = "%mor" in mor_gra_keys
        has_gra = "%gra" in mor_gra_keys

        if tabular and mor_gra_keys:
            tokens_in_table = []
            prev_token = None

            for token in self.tokens:
                token_in_table = []

                # TODO: Write a test for the clitic case.
                if token.word == _CLITIC and prev_token is not None:
                    # A clitic shares a column with the token before it:
                    # discard that token's column and rebuild it merged.
                    tokens_in_table.pop()
                    token_in_table.append(prev_token.word)
                    if has_mor:
                        token_in_table.append(
                            f"{prev_token.to_mor_tier()}~{token.to_mor_tier()}"
                        )
                    if has_gra:
                        token_in_table.append(
                            f"{prev_token.to_gra_tier()} {token.to_gra_tier()}"
                        )
                else:
                    token_in_table.append(token.word)
                    if has_mor:
                        token_in_table.append(token.to_mor_tier())
                    if has_gra:
                        token_in_table.append(token.to_gra_tier())

                prev_token = token
                tokens_in_table.append(token_in_table)

            # The first "column" holds the tier labels.
            tokens_in_table_with_keys = [
                [f"*{self.participant}:"] + [f"{key}:" for key in mor_gra_keys],
                *tokens_in_table,
            ]

            # Transpose (see https://stackoverflow.com/a/6473724)
            tiers_in_table = list(map(list, zip(*tokens_in_table_with_keys)))
            str_for_u = f"{tabulate(tiers_in_table, tablefmt='plain')}\n"
        else:
            str_for_u = f"*{self.participant}:\t{self.tiers[self.participant]}\n"
            for key in mor_gra_keys:
                str_for_u += f"{key}:\t{self.tiers[key]}\n"

        # All remaining tiers follow, in their original order.
        keys = _sort_keys(self.tiers.keys(), drop={self.participant, "%mor", "%gra"})
        for key in keys:
            str_for_u += f"{key}:\t{self.tiers[key]}\n"

        return str_for_u

    @staticmethod
    def _html_row(label: str, values) -> str:
        """Return one ``<tr>`` with a label cell and one escaped cell per value."""
        from html import escape

        cells = "".join(
            f' <td style="text-align: left">{escape(str(value))}</td>\n'
            for value in values
        )
        return f" <tr>\n <td>{escape(label)}:</td>\n{cells} </tr>\n"

    def _repr_html_(self):
        """Return the utterance as an HTML table.

        The table has one row for the words, one row each for %mor and %gra
        when these tiers are present, and one row per remaining tier whose
        single cell spans all the token columns.

        All text taken from the transcript is HTML-escaped, so that
        characters such as ``<``, ``>`` and ``&`` are displayed literally
        instead of being interpreted as markup.

        Returns
        -------
        str
        """
        # Imported locally so this method does not rely on a module-level
        # import of `html`.
        from html import escape

        # Row from words
        rows = [
            self._html_row(f"*{self.participant}", (t.word for t in self.tokens))
        ]

        # Row from %mor
        if "%mor" in self.tiers:
            rows.append(
                self._html_row("%mor", (t.to_mor_tier() for t in self.tokens))
            )

        # Row from %gra
        if "%gra" in self.tiers:
            rows.append(
                self._html_row("%gra", (t.to_gra_tier() for t in self.tokens))
            )

        keys = _sort_keys(self.tiers.keys(), drop={self.participant, "%mor", "%gra"})
        for key in keys:
            rows.append(
                f" <tr>\n"
                f" <td>{escape(str(key))}:</td>\n"
                f' <td colspan="{len(self.tokens)}" style="text-align: left">'
                f"{escape(str(self.tiers[key]))}</td>\n"
                f" </tr>\n"
            )

        return f"<table>{''.join(rows)}</table>"
def _sort_keys(keys, *, first=None, drop=None) -> List[str]: sorted_keys = [] first = first or [] drop = set(drop or []) # ordering doesn't matter for key in first: if key in keys: sorted_keys.append(key) for key in keys: if not (key in sorted_keys or key in drop): sorted_keys.append(key) return sorted_keys