diff --git a/docs/user/index.rst b/docs/user/index.rst
index 617e8b30571..11e2c10a45a 100644
--- a/docs/user/index.rst
+++ b/docs/user/index.rst
@@ -51,6 +51,7 @@ Read the Docs: documentation simplified
    /reference/cdn
    /reference/sitemaps
    /reference/404-not-found
+   /reference/llms
    /reference/robots
 
 .. toctree::
diff --git a/docs/user/reference/features.rst b/docs/user/reference/features.rst
index b6bf1884c1b..a62754aaa6b 100644
--- a/docs/user/reference/features.rst
+++ b/docs/user/reference/features.rst
@@ -59,6 +59,11 @@ Feature reference
     We provide a default 404 page,
     but you can also customize it.
 
+⏩️ :doc:`/reference/llms`
+    ``llms.txt`` files communicate expectations to LLM-focused crawlers.
+    We provide a default file,
+    but you can also customize it.
+
 ⏩️ :doc:`/reference/robots`
     `robots.txt` files allow you to customize how your documentation is indexed in search engines.
     We provide a default robots.txt file,
diff --git a/docs/user/reference/llms.rst b/docs/user/reference/llms.rst
new file mode 100644
index 00000000000..457485538a5
--- /dev/null
+++ b/docs/user/reference/llms.rst
@@ -0,0 +1,54 @@
+``llms.txt`` support
+====================
+
+``llms.txt`` files describe how large language model crawlers can use your documentation.
+They're useful for:
+
+* Signaling which parts of your site AI-focused crawlers should avoid.
+* Documenting how models can attribute your content.
+* Sharing links (like a sitemap) that help LLM-powered crawlers discover content responsibly.
+
+Read the Docs automatically generates one for you with a configuration that works for most projects.
+By default, the automatically created ``llms.txt``:
+
+* Hides versions which are set to :ref:`Hidden <versions:Version states>` from being indexed by LLM crawlers.
+* Allows crawling of all other versions.
+
+.. warning::
+
+   ``llms.txt`` files are a signal to cooperating crawlers,
+   but they aren't a guarantee that your pages will never be ingested.
+   If you require *private* documentation, see :doc:`/commercial/sharing`.
+
+How it works
+------------
+
+The ``llms.txt`` file is served from the **default version** of your project.
+Because the file is served at the top level of your domain,
+we must choose a single version to read it from,
+and the default version is the natural choice.
+To customize the file, include your own ``llms.txt`` in the built output
+of that version, and it will be served instead of the generated one.
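+
+For example, a custom ``llms.txt`` following the same format as our default
+file (the crawler name and version path below are purely illustrative)
+could look like:
+
+.. code-block:: text
+
+   User-agent: ExampleLLMBot
+   Disallow: /en/1.x/
+
+   Sitemap: https://example-project.readthedocs.io/sitemap.xml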
+
+Tool integration
+----------------
+
+Each documentation tool has its own way of including an ``llms.txt`` file in its output.
+We have examples for some of the most popular tools below.
+
+.. tabs::
+
+   .. tab:: Sphinx
+
+      Sphinx uses the `html_extra_path`_ configuration value to add static files to its final HTML output.
+      Create an ``llms.txt`` file and put it under one of the paths listed in ``html_extra_path``.
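+
+      For example, assuming a directory named ``_extra`` next to your
+      ``conf.py`` (the directory name is only a convention, not required):
+
+      .. code-block:: python
+
+         # conf.py
+         # Everything under _extra/ (e.g. _extra/llms.txt) is copied
+         # into the root of the built HTML output.
+         html_extra_path = ["_extra"]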
+
+   .. tab:: MkDocs
+
+      MkDocs needs the ``llms.txt`` file to be in the directory defined by the `docs_dir`_ configuration value.
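+
+      For example, with the default layout (``docs_dir`` defaults to
+      ``docs``), committing the file as ``docs/llms.txt`` is enough:
+
+      .. code-block:: yaml
+
+         # mkdocs.yml -- shown only to make the location explicit;
+         # "docs" is already the default value.
+         docs_dir: docs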
+ """ + + project = request.unresolved_domain.project + + if project.delisted: + return render( + request, + "llms.delisted.txt", + content_type="text/plain", + ) + + if "readthedocsext.spamfighting" in settings.INSTALLED_APPS: + from readthedocsext.spamfighting.utils import is_robotstxt_denied # noqa + + if is_robotstxt_denied(project): + return render( + request, + "llms.spam.txt", + content_type="text/plain", + ) + + version_slug = project.get_default_version() + version = project.versions.get(slug=version_slug) + + no_serve_llms_txt = any( + [ + version.privacy_level == PRIVATE, + not version.active, + not version.built, + ] + ) + + if no_serve_llms_txt: + raise Http404() + + structlog.contextvars.bind_contextvars( + project_slug=project.slug, + version_slug=version.slug, + ) + + try: + response = self._serve_docs( + request=request, + project=project, + version=version, + filename="llms.txt", + check_if_exists=True, + ) + log.info("Serving custom llms.txt file.") + return response + except StorageFileNotFound: + pass + + sitemap_url = "{scheme}://{domain}/sitemap.xml".format( + scheme="https", + domain=project.subdomain(), + ) + context = { + "sitemap_url": sitemap_url, + "hidden_paths": self._get_hidden_paths(project), + } + return render( + request, + "llms.txt", + context, + content_type="text/plain", + ) + + def _get_hidden_paths(self, project): + hidden_versions = project.versions(manager=INTERNAL).public().filter(hidden=True) + resolver = Resolver() + hidden_paths = [ + resolver.resolve_path(project, version_slug=version.slug) for version in hidden_versions + ] + return hidden_paths + + def _get_project(self): + return self.request.unresolved_domain.project + + def _get_version(self): + return None + + +class ServeLLMSTXT(SettingsOverrideObject): + _default_class = ServeLLMSTXTBase + + class ServeSitemapXMLBase(CDNCacheControlMixin, CDNCacheTagsMixin, View): """Serve sitemap.xml from the domain's root.""" diff --git a/readthedocs/templates/llms.delisted.txt b/readthedocs/templates/llms.delisted.txt new file mode 100644 index 00000000000..279b872c434 --- /dev/null +++ b/readthedocs/templates/llms.delisted.txt @@ -0,0 +1,4 @@ +# Delisted project, blocking large language model crawlers +# See: https://docs.readthedocs.io/en/stable/unofficial-projects.html +User-agent: * +Disallow: / diff --git a/readthedocs/templates/llms.spam.txt b/readthedocs/templates/llms.spam.txt new file mode 100644 index 00000000000..7e2fe691dc7 --- /dev/null +++ b/readthedocs/templates/llms.spam.txt @@ -0,0 +1,3 @@ +# Spam project detected, blocking large language model crawlers +User-agent: * +Disallow: / diff --git a/readthedocs/templates/llms.txt b/readthedocs/templates/llms.txt new file mode 100644 index 00000000000..289b359a429 --- /dev/null +++ b/readthedocs/templates/llms.txt @@ -0,0 +1,15 @@ +# This llms.txt file is autogenerated by Read the Docs. +# It describes how large language model crawlers can use your documentation. +# +# You can learn more about llms.txt, including how to customize it, in our documentation: +# +# * Our documentation on llms.txt: https://docs.readthedocs.com/platform/stable/reference/llms.html +# * Our guide about SEO techniques: https://docs.readthedocs.com/platform/stable/guides/technical-docs-seo-guide.html + +User-agent: * +{% for path in hidden_paths %} +Disallow: {{ path }} # Hidden version +{% empty %} +Disallow: # Allow everything +{% endfor %} +Sitemap: {{ sitemap_url }}