DHCodeReview · jdamerow · May 6, 2024 · May 6, 2024 · May 25, 2024 · May 25, 2024
diff --git a/__init__.py b/__init__.py
@@ -0,0 +1,333 @@
+"""__init__.py.
+
+Last Update: May 29 2024
+"""
+
+import re
+from collections.abc import Iterator
+from dataclasses import dataclass
+from typing import Callable, Iterable, List, Union
+
+import spacy
+from spacy.tokens.doc import Doc
+from timer import timer
+
+from rollingwindows import helpers
+from rollingwindows.registry import rw_components
+
+
+def get_rw_component(id: str):
+    """Get a component from the registry by id.
+
+    Args:
+        id (str): The registry id of the component
+
+    Returns:
+        The component class.
+    """
+    return rw_components.get(id)
+
+
+def sliding_str_windows(
+    input: Union[List[spacy.tokens.span.Span], spacy.tokens.doc.Doc, str],
+    n: int = 1000,
+    alignment_mode: str = "contract",
+) -> Iterator:
+    """Return a generator of string windows.
+
+    Args:
+        input (Union[List[spacy.tokens.span.Span], spacy.tokens.doc.Doc, str]): A spaCy doc or a list of spaCy spans.
+        n (int): The size of the window.
+        window_units (str): The type of units to use ("characters", "tokens", "lines", "sentences", "spans").
+        alignment_mode (str): How character indices snap to token boundaries.
+        - "strict" (no snapping)
+        - "contract" (span of all tokens completely within the character span)
+        - "expand" (span of all tokens at least partially covered by the character span)
+
+    Returns:
+        A generator of window strings.
+
+    Note:
+        Window boundaries are snapped to token boundaries in the original doc.
+        "Contract" means that the window will contain all tokens completely
+        within the boundaries of `i:i + n`. "Expand" means that window will
+        contain all tokens partially withn those boundaries. Setting
+        `alignment_mode="strict"` in `doc.char_span()` is not advised
+        because it returns `None` for any string that cannot be aligned to
+        token boundaries. As a result, a slice method is used if you want
+        to simply cut the text strictly on `n` characters.
+    """
+    # TODO: We have to iterate through the input twice to get the boundaries.
+    if isinstance(input, list):
+        input_spans = [span.as_doc() for span in input]
+        boundaries = [(i, i + n) for i in range(len(input_spans))]
+        for start, end in boundaries:
+            yield Doc.from_docs(input_spans[start:end]).text.strip()
+    else:
+        if isinstance(input, str):
+            alignment_mode = "strict"
+            boundaries = [(i, i + n) for i in range(len(input))]
+        else:
+            boundaries = [(i, i + n) for i in range(len(input.text))]
+        if alignment_mode == "strict":
+            for start_char, end_char in boundaries:
+                span = input[start_char:end_char]
+                if span is not None:
+                    yield span.text
+        else:
+            for start_char, end_char in boundaries:
+                span = input.char_span(
+                    start_char, end_char, alignment_mode=alignment_mode
+                )
+                if span is not None:
+                    yield span.text
+
+
+def sliding_windows(
+    input: Union[List[spacy.tokens.span.Span], spacy.tokens.doc.Doc],
+    n: int = 1000,
+    window_units: str = "characters",
+    alignment_mode: str = "strict",
+) -> Iterator:
+    """Create the windows generator.
+
+    Args:
+        input (Union[List[spacy.tokens.span.Span], spacy.tokens.doc.Doc]): A spaCy doc or a list of spaCy spans.
+        n (int): The size of the window.
+        window_units (str): The type of units to use ("characters", "tokens", "lines", "sentences", "spans").
+        alignment_mode (str): How character indices snap to token boundaries.
+        - "strict" (no snapping)
+        - "contract" (span of all tokens completely within the character span)
+        - "expand" (span of all tokens at least partially covered by the character span)
+
+    Yields:
+        Iterator: A generator of sliding windows.
+    """
+    # Process character windows
+    if window_units == "characters":
+        boundaries = [(i, i + n) for i in range(len(input.text))]
+        if alignment_mode == "strict":
+            for start_char, end_char in boundaries:
+                yield input.text[start_char:end_char]
+        else:
+            for start_char, end_char in boundaries:
+                window = input.char_span(
+                    start_char, end_char, alignment_mode=alignment_mode
+                )
+                if window is not None:
+                    yield window.text
+
+    # Process span and token windows
+    elif window_units in ["lines", "sentences", "spans", "tokens"]:
+        boundaries = [(i, i + n) for i in range(len(input))]
+        for start, end in boundaries:
+            yield input[start:end]
+    else:
+        raise Exception("Invalid window unit.")
+
+
+# Windows class
+@dataclass
+class Windows:
+    """A dataclass for storing rolling windows."""
+    windows: Iterable
+    window_units: str
+    n: int
+    alignment_mode: str = "strict"
+
+    def __iter__(self):
+        """Iterate over the windows."""
+        return iter(self.windows)
+
+
+# RollingWindows class
+class RollingWindows:
+    """A class for managing a rolling windows workflow."""
+
+    def __init__(
+        self,
+        doc: spacy.tokens.doc.Doc,
+        model: str,
+        *,
+        patterns: Union[list, str] = None,
+    ):
+        """Initialise a RollingWindows object.
+
+        Args:
+            doc (spacy.tokens.doc.Doc): A spaCy Doc object.
+            model (str): The name of a spaCy model.
+            patterns (Union[list, str]): The patterns to match.
+        """
+        self.doc = doc
+        self.nlp = spacy.load(model)
+        if patterns:
+            self.patterns = helpers.ensure_list(patterns)
+        else:
+            self.patterns = []
+        self.metadata = {"model": model}
+
+    def _get_search_method(self, window_units: str = None) -> str:
+        """Get the search method based on the window type.
+
+        Args:
+            window_units (str): The type of window unit.
+
+        Returns:
+            str: The preliminary search method
+        """
+        methods = {
+            "characters": "count",
+            "tokens": "spacy_matcher",
+            "lines": "spacy_matcher",
+            "sentences": "spacy_matcher",
+        }
+        return methods.get(window_units, "re_finditer")
+
+    def _get_units(
+        self, doc: spacy.tokens.doc.Doc, window_units: str = "characters"
+    ) -> Union[List[spacy.tokens.span.Span], spacy.tokens.doc.Doc]:
+        """Get a list of characters, sentences, lines, or tokens.
+
+        Args:
+            doc (spacy.tokens.doc.Doc): A list of spaCy spans or docs.
+            window_units (str): "characters", "lines", "sentences", or "tokens".
+
+        Returns:
+            Union[List[spacy.tokens.span.Span], spacy.tokens.doc.Doc]: A list of spaCy spans or the original doc
+        """
+        if window_units == "sentences":
+            if doc.has_annotation("SENT_START"):
+                return list(doc.sents)
+        elif window_units == "lines":
+            regex = r"^(.+)\n+|(.+)\n+|(.+)$"
+            lines = []
+            for match in re.finditer(regex, doc.text):
+                start, end = match.span()
+                span = doc.char_span(start, end)
+                if span is not None:
+                    lines.append(span)
+            return lines
+        else:
+            return doc
+
+    @timer
+    def calculate(
+        self,
+        patterns: Union[list, str] = None,
+        calculator: Union[Callable, str] = "rw_calculator",
+        query: str = "counts",
+        show_spacy_rules: bool = False,
+    ) -> None:
+        """Set up a calculator.
+
+        Args:
+            patterns: (Union[list, str]): The patterns to search for.
+                        calculator (Union[Callable, str]): The calculator to use.
+            query (str): String designating whether to return "counts", "averages", or "ratios".
+                        show_spacy_rules (bool): Whether to use spaCy rules or strings in column labels
+        """
+        if not hasattr(self, "windows"):
+            raise Exception("You must call set_windows() before running calculations.")
+        else:
+            if calculator:
+                # Use the "averages" calculator with the default config
+                if isinstance(calculator, str):
+                    if patterns is not None:
+                        self.patterns = patterns
+                    calculator = get_rw_component(calculator)
+                    calculator = calculator(
+                        patterns=self.patterns, windows=self.windows, query=query
+                    )
+            calculator.run(query=calculator.query)
+            self.metadata["calculator"] = calculator.metadata
+            self.result = calculator.to_df(show_spacy_rules=show_spacy_rules)
+
+    def plot(
+        self,
+        plotter: Union[Callable, str] = "rw_simple_plotter",
+        show: bool = False,
+        file: str = None,
+        **kwargs,
+    ) -> None:
+        """Set up the plotter.
+
+        Args:
+            plotter (Union[Callable, str]): The plotter to use.
+            show (bool): Whether to show the generated figure.
+            file (str): The filepath to save the file, if desired.
+        """
+        if not hasattr(self, "result") or self.result is None:
+            raise Exception(
+                "You must run a calculator on your data before generating a plot."
+            )
+        # Use the "rw_simple_plotter" plotter with the default config
+        if isinstance(plotter, str):
+            plotter = get_rw_component(plotter)
+            plotter = plotter()
+        plotter.run(self.result)
+        self.metadata["plotter"] = plotter.metadata
+        self.fig = plotter.fig
+        if show:
+            plotter.show(**kwargs)
+        if file:
+            plotter.save(file, **kwargs)
+
+    # @timer
+    def set_windows(
+        self,
+        n: int = 1000,
+        window_units: str = "characters",
+        *,
+        alignment_mode: str = "strict",
+        filter: Union[Callable, str] = None,
+    ) -> None:
+        """Set the object's windows.
+
+        Args:
+            n (int): The number of windows to calculate
+            window_units (str): "characters", "lines", "sentences", or "tokens".
+            alignment_mode (str): How character indices snap to token boundaries.
+            - "strict" (no snapping)
+            - "contract" (span of all tokens completely within the character span)
+            - "expand" (span of all tokens at least partially covered by the character span)
+            filter (Union[Callable, str]): The name of a filter or a filter object to apply to the document.
+        """
+        if filter:
+            # Use the filter with the default config
+            if isinstance(filter, str):
+                filter = get_rw_component(filter)
+                filter = filter(self.doc)
+            doc = filter.apply()
+        else:
+            doc = self.doc
+        # _get_units() returns either a doc or a list of spans. The doc is used to slide over
+        # characters or tokens, and the list is used to slide over sentences or lines.
+        input = self._get_units(doc, window_units)
+        # sliding_windows() returns a generator containing with string or span windows.
+        if window_units == "characters":
+            if isinstance(input, list):
+                input = " ".join([span.text for span in input])
+            windows = sliding_str_windows(input, n, alignment_mode)
+        else:
+            windows = sliding_windows(input, n, window_units, alignment_mode)
+        # Since spans windows are lists of multiple spans, we need to get the first and last
+        # token from the original doc to get a window that combines them into a single span.
+        if window_units in ["lines", "sentences", "spans"]:
+            span_windows = (doc[window[0].start : window[-1].end] for window in windows)
+            self.windows = Windows(span_windows, window_units, n, alignment_mode)
+        else:
+            self.windows = Windows(windows, window_units, n, alignment_mode)
+        # For convenience's sake, we detect the search method here, but the calculator
+        # will override it based on the pattern.
+        search_method = self._get_search_method(window_units)
+        metadata = {
+            "n": n,
+            "window_units": window_units,
+            "alignment_mode": alignment_mode,
+            "search_method": search_method,
+        }
+        if filter:
+            metadata["filter"] = filter.metadata
+        else:
+            self.metadata.pop("filter", None)
+        self.metadata = self.metadata | metadata