diff --git a/backend/openedx_ai_extensions/processors/component_extractors.py b/backend/openedx_ai_extensions/processors/component_extractors.py new file mode 100644 index 00000000..8e3b682c --- /dev/null +++ b/backend/openedx_ai_extensions/processors/component_extractors.py @@ -0,0 +1,338 @@ +""" +Clean, LLM-friendly formatting for HTML, Video, Problem, and all other XBlocks. +""" + +import json +import logging +from typing import Optional + +from bs4 import BeautifulSoup # pylint: disable=import-error +from django.conf import settings + +logger = logging.getLogger(__name__) + +filters = settings.AI_EXTENSIONS_FIELD_FILTERS +ALLOWED_FIELDS = filters.get("allowed_fields", []) +ALLOWED_FIELD_SUBSTRINGS = filters.get("allowed_field_substrings", []) +# ----------------------------- +# Embedded content helpers +# ----------------------------- + + +def _extract_iframes(soup: BeautifulSoup) -> list[str]: + """ + Extract all