diff --git a/nginx.conf b/nginx.conf
index 02f67b612a..58f6224779 100644
--- a/nginx.conf
+++ b/nginx.conf
@@ -534,8 +534,36 @@ server {
     rewrite ^/platform/actors/development/source-code$ /platform/actors/development/deployment/source-types redirect;

     # Academy restructuring
-    rewrite ^academy/advanced-web-scraping/scraping-paginated-sites$ /academy/advanced-web-scraping/crawling/crawling-with-search permanent;
-    rewrite ^academy/php$ /academy/php/use-apify-from-php redirect; # not permanent in case we want to reuse /php in the future
+    rewrite ^/academy/advanced-web-scraping/scraping-paginated-sites$ /academy/advanced-web-scraping/crawling/crawling-with-search permanent;
+    rewrite ^/academy/php$ /academy/php/use-apify-from-php redirect; # not permanent in case we want to reuse /php in the future
+
+    # Academy: replacing the 'Web Scraping for Beginners' course
+    rewrite ^/academy/web-scraping-for-beginners/best-practices$ /academy/scraping-basics-javascript?legacy-js-course=/best-practices permanent;
+    rewrite ^/academy/web-scraping-for-beginners/introduction$ /academy/scraping-basics-javascript?legacy-js-course=/introduction permanent;
+    rewrite ^/academy/web-scraping-for-beginners/challenge/initializing-and-setting-up$ /academy/scraping-basics-javascript?legacy-js-course=/challenge/initializing-and-setting-up permanent;
+    rewrite ^/academy/web-scraping-for-beginners/challenge/modularity$ /academy/scraping-basics-javascript?legacy-js-course=/challenge/modularity permanent;
+    rewrite ^/academy/web-scraping-for-beginners/challenge/scraping-amazon$ /academy/scraping-basics-javascript?legacy-js-course=/challenge/scraping-amazon permanent;
+    rewrite ^/academy/web-scraping-for-beginners/challenge$ /academy/scraping-basics-javascript?legacy-js-course=/challenge permanent;
+    rewrite ^/academy/web-scraping-for-beginners/crawling/exporting-data$ /academy/scraping-basics-javascript/framework?legacy-js-course=/crawling/exporting-data permanent;
+    rewrite ^/academy/web-scraping-for-beginners/crawling/filtering-links$ /academy/scraping-basics-javascript/getting-links?legacy-js-course=/crawling/filtering-links permanent;
+    rewrite ^/academy/web-scraping-for-beginners/crawling/finding-links$ /academy/scraping-basics-javascript/getting-links?legacy-js-course=/crawling/finding-links permanent;
+    rewrite ^/academy/web-scraping-for-beginners/crawling/first-crawl$ /academy/scraping-basics-javascript/crawling?legacy-js-course=/crawling/first-crawl permanent;
+    rewrite ^/academy/web-scraping-for-beginners/crawling/headless-browser$ /academy/scraping-basics-javascript?legacy-js-course=/crawling/headless-browser permanent;
+    rewrite ^/academy/web-scraping-for-beginners/crawling/pro-scraping$ /academy/scraping-basics-javascript/framework?legacy-js-course=/crawling/pro-scraping permanent;
+    rewrite ^/academy/web-scraping-for-beginners/crawling/recap-extraction-basics$ /academy/scraping-basics-javascript/extracting-data?legacy-js-course=/crawling/recap-extraction-basics permanent;
+    rewrite ^/academy/web-scraping-for-beginners/crawling/relative-urls$ /academy/scraping-basics-javascript/getting-links?legacy-js-course=/crawling/relative-urls permanent;
+    rewrite ^/academy/web-scraping-for-beginners/crawling/scraping-the-data$ /academy/scraping-basics-javascript/scraping-variants?legacy-js-course=/crawling/scraping-the-data permanent;
+    rewrite ^/academy/web-scraping-for-beginners/crawling$ /academy/scraping-basics-javascript/crawling?legacy-js-course=/crawling permanent;
+    rewrite ^/academy/web-scraping-for-beginners/data-extraction/browser-devtools$ /academy/scraping-basics-javascript/devtools-inspecting?legacy-js-course=/data-extraction/browser-devtools permanent;
+    rewrite ^/academy/web-scraping-for-beginners/data-extraction/computer-preparation$ /academy/scraping-basics-javascript/downloading-html?legacy-js-course=/data-extraction/computer-preparation permanent;
+    rewrite ^/academy/web-scraping-for-beginners/data-extraction/devtools-continued$ /academy/scraping-basics-javascript/devtools-extracting-data?legacy-js-course=/data-extraction/devtools-continued permanent;
+    rewrite ^/academy/web-scraping-for-beginners/data-extraction/node-continued$ /academy/scraping-basics-javascript/extracting-data?legacy-js-course=/data-extraction/node-continued permanent;
+    rewrite ^/academy/web-scraping-for-beginners/data-extraction/node-js-scraper$ /academy/scraping-basics-javascript/downloading-html?legacy-js-course=/data-extraction/node-js-scraper permanent;
+    rewrite ^/academy/web-scraping-for-beginners/data-extraction/project-setup$ /academy/scraping-basics-javascript/downloading-html?legacy-js-course=/data-extraction/project-setup permanent;
+    rewrite ^/academy/web-scraping-for-beginners/data-extraction/save-to-csv$ /academy/scraping-basics-javascript/saving-data?legacy-js-course=/data-extraction/save-to-csv permanent;
+    rewrite ^/academy/web-scraping-for-beginners/data-extraction/using-devtools$ /academy/scraping-basics-javascript/devtools-locating-elements?legacy-js-course=/data-extraction/using-devtools permanent;
+    rewrite ^/academy/web-scraping-for-beginners/data-extraction$ /academy/scraping-basics-javascript/devtools-inspecting?legacy-js-course=/data-extraction permanent;
+    rewrite ^/academy/web-scraping-for-beginners$ /academy/scraping-basics-javascript?legacy-js-course=/ permanent;

     # Removed pages
     # GPT plugins were discontinued April 9th, 2024 - https://help.openai.com/en/articles/8988022-winding-down-the-chatgpt-plugins-beta
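A script along these lines can spot-check redirect rules like the ones above after deployment. This is a minimal sketch, assuming the docs are served from `docs.apify.com` and Node.js 18+ for the built-in `fetch`; nginx's `permanent` flag answers with a `301`:

```js
// check-redirects.mjs - spot-check a few of the new rewrite rules.
const checks = [
  ['/academy/web-scraping-for-beginners', '/academy/scraping-basics-javascript?legacy-js-course=/'],
  ['/academy/web-scraping-for-beginners/crawling/first-crawl', '/academy/scraping-basics-javascript/crawling?legacy-js-course=/crawling/first-crawl'],
];

for (const [from, expected] of checks) {
  // redirect: 'manual' stops fetch from following the redirect so we can inspect it.
  const res = await fetch(`https://docs.apify.com${from}`, { redirect: 'manual' });
  const location = res.headers.get('location') ?? '';
  const ok = res.status === 301 && location.endsWith(expected);
  console.log(ok ? 'OK' : 'FAIL', from, '->', location || `(status ${res.status})`);
}
```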
diff --git a/sources/academy/glossary/concepts/robot_process_automation.md b/sources/academy/glossary/concepts/robot_process_automation.md
index 2fc55424b6..3671fe7cb6 100644
--- a/sources/academy/glossary/concepts/robot_process_automation.md
+++ b/sources/academy/glossary/concepts/robot_process_automation.md
@@ -1,5 +1,5 @@
 ---
-title: Robotic process automation
+title: What is robotic process automation (RPA)?
 description: Learn the basics of robotic process automation. Make your processes on the web and other software more efficient by automating repetitive tasks.
 sidebar_position: 8.7
 slug: /concepts/robotic-process-automation
@@ -29,7 +29,7 @@ With the advance of [machine learning](https://en.wikipedia.org/wiki/Machine_lea

 ## Is RPA the same as web scraping? {#is-rpa-the-same-as-web-scraping}

-While [web scraping](../../webscraping/scraping_basics_javascript/index.md) is a kind of RPA, it focuses on extracting structured data. RPA focuses on the other tasks in browsers - everything except for extracting information.
+While web scraping is a kind of RPA, it focuses on extracting structured data. RPA focuses on the other tasks in browsers - everything except for extracting information.

 ## Additional resources {#additional-resources}
diff --git a/sources/academy/glossary/tools/apify_cli.md b/sources/academy/glossary/tools/apify_cli.md
index 032ff99b9e..8e75bb76be 100644
--- a/sources/academy/glossary/tools/apify_cli.md
+++ b/sources/academy/glossary/tools/apify_cli.md
@@ -13,7 +13,7 @@ The [Apify CLI](/cli) helps you create, develop, build and run Apify Actors, and

 ## Installing {#installing}

-To install the Apify CLI, you'll first need npm, which comes preinstalled with Node.js. If you haven't yet installed Node, [learn how to do that](../../webscraping/scraping_basics_javascript/data_extraction/computer_preparation.md). Additionally, make sure you've got an Apify account, as you will need to log in to the CLI to gain access to its full potential.
+To install the Apify CLI, you'll first need npm, which comes preinstalled with Node.js. Additionally, make sure you've got an Apify account, as you will need to log in to the CLI to gain access to its full potential.

 Open up a terminal instance and run the following command:
diff --git a/sources/academy/homepage_content.json b/sources/academy/homepage_content.json
index 0d02329bdb..7283287c98 100644
--- a/sources/academy/homepage_content.json
+++ b/sources/academy/homepage_content.json
@@ -2,7 +2,7 @@
   "Beginner courses": [
     {
       "title": "Web scraping basics with JS",
-      "link": "/academy/web-scraping-for-beginners",
+      "link": "/academy/scraping-basics-javascript",
       "description": "Learn how to use JavaScript to extract information from websites in this practical course, starting from the absolute basics.",
       "imageUrl": "/img/academy/scraping-basics-javascript.svg"
     },
diff --git a/sources/academy/platform/expert_scraping_with_apify/actors_webhooks.md b/sources/academy/platform/expert_scraping_with_apify/actors_webhooks.md
index 97f2a8ca3c..c5c7d1d4a8 100644
--- a/sources/academy/platform/expert_scraping_with_apify/actors_webhooks.md
+++ b/sources/academy/platform/expert_scraping_with_apify/actors_webhooks.md
@@ -1,19 +1,26 @@
 ---
-title: I - Webhooks & advanced Actor overview
+title: Webhooks & advanced Actor overview
 description: Learn more advanced details about Actors, how they work, and the default configurations they can take. Also, learn how to integrate your Actor with webhooks.
 sidebar_position: 6.1
+sidebar_label: I - Webhooks & advanced Actor overview
 slug: /expert-scraping-with-apify/actors-webhooks
 ---

 **Learn more advanced details about Actors, how they work, and the default configurations they can take. Also, learn how to integrate your Actor with webhooks.**

+:::caution Updates coming
+
+This lesson is subject to change because it currently relies on code from our archived **Web scraping basics for JavaScript devs** course. For now, you can still access the archived course, but we plan to completely retire it in a few months. This lesson will be updated to remove the dependency.
+
+:::
+
 ---

 Thus far, you've run Actors on the platform and written an Actor of your own, which you published to the platform yourself using the Apify CLI; therefore, it's fair to say that you are becoming more familiar and comfortable with the concept of **Actors**. Within this lesson, we'll take a more in-depth look at Actors and what they can do.

 ## Advanced Actor overview {#advanced-actors}

-In this course, we'll be working out of the Amazon scraper project from the **Web scraping basics for JavaScript devs** course. If you haven't already built that project, you can do it in [three short lessons](../../webscraping/scraping_basics_javascript/challenge/index.md). We've made a few small modifications to the project with the Apify SDK, but 99% of the code is still the same.
+In this course, we'll be working out of the Amazon scraper project from the **Web scraping basics for JavaScript devs** course. If you haven't already built that project, you can do it in [three short lessons](../../webscraping/scraping_basics_legacy/challenge/index.md). We've made a few small modifications to the project with the Apify SDK, but 99% of the code is still the same.

 Take another look at the files within your Amazon scraper project. You'll notice that there is a **Dockerfile**. Every single Actor has a Dockerfile (the Actor's **Image**) which tells Docker how to spin up a container on the Apify platform which can successfully run the Actor's code. "Apify Actors" is a serverless platform that runs multiple Docker containers. For a deeper understanding of Actor Dockerfiles, refer to the [Apify Actor Dockerfile docs](/sdk/js/docs/guides/docker-images#example-dockerfile).
@@ -39,7 +46,7 @@ Prior to moving forward, please read over these resources:

 ## Our task {#our-task}

-In this task, we'll be building on top of what we already created in the [Web scraping basics for JavaScript devs](/academy/web-scraping-for-beginners/challenge) course's final challenge, so keep those files safe!
+In this task, we'll be building on top of what we already created in the [Web scraping basics for JavaScript devs](../../webscraping/scraping_basics_legacy/challenge/index.md) course's final challenge, so keep those files safe!

 Once our Amazon Actor has completed its run, we will, rather than sending an email to ourselves, call an Actor through a webhook. The Actor called will be a new Actor that we will create together, which will take the dataset ID as input, then subsequently filter through all of the results and return only the cheapest one for each product. All of the results of the Actor will be pushed to its default dataset.
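For a sense of scale, the filtering logic described here fits in a few lines with the Apify SDK. Below is a hedged sketch of such an Actor; the `title` and `price` field names are assumptions borrowed from the Amazon scraper project and may not match your dataset exactly:

```js
// main.js of a hypothetical filter Actor: keep the cheapest offer per product.
import { Actor } from 'apify';

await Actor.init();

const { datasetId } = (await Actor.getInput()) ?? {};
const dataset = await Actor.openDataset(datasetId);
// getData() returns one page of items; pagination is omitted in this sketch.
const { items } = await dataset.getData();

// Group by product title and keep only the lowest-priced item for each.
const cheapest = new Map();
for (const item of items) {
  const current = cheapest.get(item.title);
  if (!current || item.price < current.price) cheapest.set(item.title, item);
}

await Actor.pushData([...cheapest.values()]);
await Actor.exit();
```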
diff --git a/sources/academy/platform/expert_scraping_with_apify/apify_api_and_client.md b/sources/academy/platform/expert_scraping_with_apify/apify_api_and_client.md
index cb5a1fd734..15e5e42266 100644
--- a/sources/academy/platform/expert_scraping_with_apify/apify_api_and_client.md
+++ b/sources/academy/platform/expert_scraping_with_apify/apify_api_and_client.md
@@ -1,7 +1,8 @@
 ---
-title: IV - Apify API & client
+title: Apify API & client
 description: Gain an in-depth understanding of the two main ways of programmatically interacting with the Apify platform - through the API, and through a client.
 sidebar_position: 6.4
+sidebar_label: IV - Apify API & client
 slug: /expert-scraping-with-apify/apify-api-and-client
 ---
diff --git a/sources/academy/platform/expert_scraping_with_apify/bypassing_anti_scraping.md b/sources/academy/platform/expert_scraping_with_apify/bypassing_anti_scraping.md
index 4f6f8757b5..e5f034e1aa 100644
--- a/sources/academy/platform/expert_scraping_with_apify/bypassing_anti_scraping.md
+++ b/sources/academy/platform/expert_scraping_with_apify/bypassing_anti_scraping.md
@@ -1,7 +1,8 @@
 ---
-title: VI - Bypassing anti-scraping methods
+title: Bypassing anti-scraping methods
 description: Learn about bypassing anti-scraping methods using proxies and proxy/session rotation together with Crawlee and the Apify SDK.
 sidebar_position: 6.6
+sidebar_label: VI - Bypassing anti-scraping methods
 slug: /expert-scraping-with-apify/bypassing-anti-scraping
 ---
diff --git a/sources/academy/platform/expert_scraping_with_apify/index.md b/sources/academy/platform/expert_scraping_with_apify/index.md
index 6c1be153b5..87bb4a7178 100644
--- a/sources/academy/platform/expert_scraping_with_apify/index.md
+++ b/sources/academy/platform/expert_scraping_with_apify/index.md
@@ -18,13 +18,9 @@ Before developing a pro-level Apify scraper, there are some important things you

 > If you've already gone through the [Web scraping basics for JavaScript devs](../../webscraping/scraping_basics_javascript/index.md) and the first courses of the [Apify platform category](../apify_platform.md), you will be more than well equipped to continue on with the lessons in this course.

-
-
 ### Crawlee, Apify SDK, and the Apify CLI {#crawlee-apify-sdk-and-cli}

-If you're feeling ambitious, you don't need to have any prior experience with Crawlee to get started with this course; however, at least 5–10 minutes of exposure is recommended. If you haven't yet tried out Crawlee, you can refer to [this lesson](../../webscraping/scraping_basics_javascript/crawling/pro_scraping.md) in the **Web scraping basics for JavaScript devs** course (and ideally follow along). To familiarize yourself with the Apify SDK, you can refer to the [Apify Platform](../apify_platform.md) category.
+If you're feeling ambitious, you don't need to have any prior experience with Crawlee to get started with this course; however, at least 5–10 minutes of exposure is recommended. If you haven't yet tried out Crawlee, you can refer to the [Using a scraping framework with Node.js](../../webscraping/scraping_basics_javascript/12_framework.md) lesson of the **Web scraping basics for JavaScript devs** course. To familiarize yourself with the Apify SDK, you can refer to the [Apify Platform](../apify_platform.md) category.

 The Apify CLI will play a core role in the running and testing of the Actor you will build, so if you haven't gotten it installed already, please refer to [this short lesson](../../glossary/tools/apify_cli.md).
diff --git a/sources/academy/platform/expert_scraping_with_apify/managing_source_code.md b/sources/academy/platform/expert_scraping_with_apify/managing_source_code.md
index f8e4255e90..e9cf36d342 100644
--- a/sources/academy/platform/expert_scraping_with_apify/managing_source_code.md
+++ b/sources/academy/platform/expert_scraping_with_apify/managing_source_code.md
@@ -1,7 +1,8 @@
 ---
-title: II - Managing source code
+title: Managing source code
 description: Learn how to manage your Actor's source code more efficiently by integrating it with a GitHub repository. This is standard on the Apify platform.
 sidebar_position: 6.2
+sidebar_label: II - Managing source code
 slug: /expert-scraping-with-apify/managing-source-code
 ---
diff --git a/sources/academy/platform/expert_scraping_with_apify/migrations_maintaining_state.md b/sources/academy/platform/expert_scraping_with_apify/migrations_maintaining_state.md
index fa6eb61f1b..707e64fd7d 100644
--- a/sources/academy/platform/expert_scraping_with_apify/migrations_maintaining_state.md
+++ b/sources/academy/platform/expert_scraping_with_apify/migrations_maintaining_state.md
@@ -1,7 +1,8 @@
 ---
-title: V - Migrations & maintaining state
+title: Migrations & maintaining state
 description: Learn about what Actor migrations are and how to handle them properly so that the state is not lost and runs can safely be resurrected.
 sidebar_position: 6.5
+sidebar_label: V - Migrations & maintaining state
 slug: /expert-scraping-with-apify/migrations-maintaining-state
 ---
diff --git a/sources/academy/platform/expert_scraping_with_apify/saving_useful_stats.md b/sources/academy/platform/expert_scraping_with_apify/saving_useful_stats.md
index 1cd8feca90..6bc13433f1 100644
--- a/sources/academy/platform/expert_scraping_with_apify/saving_useful_stats.md
+++ b/sources/academy/platform/expert_scraping_with_apify/saving_useful_stats.md
@@ -1,7 +1,8 @@
 ---
-title: VII - Saving useful run statistics
+title: Saving useful run statistics
 description: Understand how to save statistics about an Actor's run, what types of statistics you can save, and why you might want to save them for a large-scale scraper.
 sidebar_position: 6.7
+sidebar_label: VII - Saving useful run statistics
 slug: /expert-scraping-with-apify/saving-useful-stats
 ---
diff --git a/sources/academy/platform/expert_scraping_with_apify/solutions/handling_migrations.md b/sources/academy/platform/expert_scraping_with_apify/solutions/handling_migrations.md
index 056bcb736f..24399f8df6 100644
--- a/sources/academy/platform/expert_scraping_with_apify/solutions/handling_migrations.md
+++ b/sources/academy/platform/expert_scraping_with_apify/solutions/handling_migrations.md
@@ -1,7 +1,8 @@
 ---
-title: V - Handling migrations
+title: Handling migrations
 description: Get real-world experience of maintaining a stateful object stored in memory, which will be persisted through migrations and even graceful aborts.
 sidebar_position: 5
+sidebar_label: V - Handling migrations
 slug: /expert-scraping-with-apify/solutions/handling-migrations
 ---
diff --git a/sources/academy/platform/expert_scraping_with_apify/solutions/integrating_webhooks.md b/sources/academy/platform/expert_scraping_with_apify/solutions/integrating_webhooks.md
index d90ba7f76f..926dc70131 100644
--- a/sources/academy/platform/expert_scraping_with_apify/solutions/integrating_webhooks.md
+++ b/sources/academy/platform/expert_scraping_with_apify/solutions/integrating_webhooks.md
@@ -1,12 +1,19 @@
 ---
-title: I - Integrating webhooks
+title: Integrating webhooks
 description: Learn how to integrate webhooks into your Actors. Webhooks are a super powerful tool, and can be used to do almost anything!
 sidebar_position: 1
+sidebar_label: I - Integrating webhooks
 slug: /expert-scraping-with-apify/solutions/integrating-webhooks
 ---

 **Learn how to integrate webhooks into your Actors. Webhooks are a super powerful tool, and can be used to do almost anything!**

+:::caution Updates coming
+
+This lesson is subject to change because it currently relies on code from our archived **Web scraping basics for JavaScript devs** course. For now, you can still access the archived course, but we plan to completely retire it in a few months. This lesson will be updated to remove the dependency.
+
+:::
+
 ---

 In this lesson we'll be writing a new Actor and integrating it with our beloved Amazon scraping Actor. First, we'll navigate to the same directory where our **demo-actor** folder lives, and run `apify create filter-actor` _(once again, you can name the Actor whatever you want, but for this lesson, we'll be calling the new Actor **filter-actor**)_. When prompted about the programming language, select **JavaScript**:
diff --git a/sources/academy/platform/expert_scraping_with_apify/solutions/managing_source.md b/sources/academy/platform/expert_scraping_with_apify/solutions/managing_source.md
index 5ccc2b42a9..9fed4b5d2d 100644
--- a/sources/academy/platform/expert_scraping_with_apify/solutions/managing_source.md
+++ b/sources/academy/platform/expert_scraping_with_apify/solutions/managing_source.md
@@ -1,7 +1,8 @@
 ---
-title: II - Managing source
+title: Managing source
 description: View in-depth answers for all three of the quiz questions that were provided in the corresponding lesson about managing source code.
 sidebar_position: 2
+sidebar_label: II - Managing source
 slug: /expert-scraping-with-apify/solutions/managing-source
 ---
diff --git a/sources/academy/platform/expert_scraping_with_apify/solutions/rotating_proxies.md b/sources/academy/platform/expert_scraping_with_apify/solutions/rotating_proxies.md
index 7b0193f248..88755208eb 100644
--- a/sources/academy/platform/expert_scraping_with_apify/solutions/rotating_proxies.md
+++ b/sources/academy/platform/expert_scraping_with_apify/solutions/rotating_proxies.md
@@ -1,7 +1,8 @@
 ---
-title: VI - Rotating proxies/sessions
+title: Rotating proxies/sessions
 description: Learn firsthand how to rotate proxies and sessions in order to avoid the majority of the most common anti-scraping protections.
 sidebar_position: 6
+sidebar_label: VI - Rotating proxies/sessions
 slug: /expert-scraping-with-apify/solutions/rotating-proxies
 ---
diff --git a/sources/academy/platform/expert_scraping_with_apify/solutions/saving_stats.md b/sources/academy/platform/expert_scraping_with_apify/solutions/saving_stats.md
index b46ec068cc..3915dee01c 100644
--- a/sources/academy/platform/expert_scraping_with_apify/solutions/saving_stats.md
+++ b/sources/academy/platform/expert_scraping_with_apify/solutions/saving_stats.md
@@ -1,7 +1,8 @@
 ---
-title: VII - Saving run stats
+title: Saving run stats
 description: Implement the saving of general statistics about an Actor's run, as well as adding request-specific statistics to dataset items.
 sidebar_position: 7
+sidebar_label: VII - Saving run stats
 slug: /expert-scraping-with-apify/solutions/saving-stats
 ---
diff --git a/sources/academy/platform/expert_scraping_with_apify/solutions/using_api_and_client.md b/sources/academy/platform/expert_scraping_with_apify/solutions/using_api_and_client.md
index dce2ba016f..e2cf1cf0d3 100644
--- a/sources/academy/platform/expert_scraping_with_apify/solutions/using_api_and_client.md
+++ b/sources/academy/platform/expert_scraping_with_apify/solutions/using_api_and_client.md
@@ -1,7 +1,8 @@
 ---
-title: IV - Using the Apify API & JavaScript client
+title: Using the Apify API & JavaScript client
 description: Learn how to interact with the Apify API directly through the well-documented RESTful routes, or by using the proprietary Apify JavaScript client.
 sidebar_position: 4
+sidebar_label: IV - Using the Apify API & JavaScript client
 slug: /expert-scraping-with-apify/solutions/using-api-and-client
 ---
diff --git a/sources/academy/platform/expert_scraping_with_apify/solutions/using_storage_creating_tasks.md b/sources/academy/platform/expert_scraping_with_apify/solutions/using_storage_creating_tasks.md
index 91528d5c5c..4df5c3d43f 100644
--- a/sources/academy/platform/expert_scraping_with_apify/solutions/using_storage_creating_tasks.md
+++ b/sources/academy/platform/expert_scraping_with_apify/solutions/using_storage_creating_tasks.md
@@ -1,7 +1,8 @@
 ---
-title: III - Using storage & creating tasks
+title: Using storage & creating tasks
 description: Get quiz answers and explanations for the lesson about using storage and creating tasks on the Apify platform.
 sidebar_position: 3
+sidebar_label: III - Using storage & creating tasks
 slug: /expert-scraping-with-apify/solutions/using-storage-creating-tasks
 ---
diff --git a/sources/academy/platform/expert_scraping_with_apify/tasks_and_storage.md b/sources/academy/platform/expert_scraping_with_apify/tasks_and_storage.md
index 22c15a5eaa..ea5640c14d 100644
--- a/sources/academy/platform/expert_scraping_with_apify/tasks_and_storage.md
+++ b/sources/academy/platform/expert_scraping_with_apify/tasks_and_storage.md
@@ -1,7 +1,8 @@
 ---
-title: III - Tasks & storage
+title: Tasks & storage
 description: Understand how to save the configurations for Actors with Actor tasks. Also, learn about storage and the different types Apify offers.
 sidebar_position: 6.3
+sidebar_label: III - Tasks & storage
 slug: /expert-scraping-with-apify/tasks-and-storage
 ---
diff --git a/sources/academy/tutorials/node_js/analyzing_pages_and_fixing_errors.md b/sources/academy/tutorials/node_js/analyzing_pages_and_fixing_errors.md
index 316beb5ee2..d2314a7bd7 100644
--- a/sources/academy/tutorials/node_js/analyzing_pages_and_fixing_errors.md
+++ b/sources/academy/tutorials/node_js/analyzing_pages_and_fixing_errors.md
@@ -69,8 +69,6 @@ try {
 }
 ```

-Read more information about logging and error handling in our developer [best practices](../../webscraping/scraping_basics_javascript/best_practices.md) section.
-
 ### Saving snapshots {#saving-snapshots}

 By snapshots, we mean **screenshots** if you use a [browser with Puppeteer/Playwright](../../webscraping/puppeteer_playwright/index.md) and HTML saved into a [key-value store](https://crawlee.dev/api/core/class/KeyValueStore) that you can display in your own browser. Snapshots are useful throughout your code but especially important in error handling.
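To make the snapshots advice concrete: with Crawlee, a snapshot helper can be as small as the sketch below. It assumes a Playwright or Puppeteer `page` object and is meant as an illustration, not as the tutorial's own code:

```js
import { KeyValueStore } from 'crawlee';

// Call this from a catch block or from a failedRequestHandler.
async function saveSnapshot(page, key) {
  const store = await KeyValueStore.open();
  // Screenshot of the rendered page...
  await store.setValue(`${key}-screenshot`, await page.screenshot({ fullPage: true }), {
    contentType: 'image/png',
  });
  // ...and the raw HTML, which you can open directly in a browser.
  await store.setValue(`${key}-html`, await page.content(), { contentType: 'text/html' });
}
```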
diff --git a/sources/academy/tutorials/node_js/dealing_with_dynamic_pages.md b/sources/academy/tutorials/node_js/dealing_with_dynamic_pages.md
index f5a691fdeb..d1c1b60ecc 100644
--- a/sources/academy/tutorials/node_js/dealing_with_dynamic_pages.md
+++ b/sources/academy/tutorials/node_js/dealing_with_dynamic_pages.md
@@ -41,7 +41,7 @@ If you're in a brand new project, don't forget to initialize your project, then
 npm init -y && npm i crawlee
 ```

-Now, let's write some data extraction code to extract each product's data. This should look familiar if you went through the [Data Extraction](../../webscraping/scraping_basics_javascript/data_extraction/index.md) lessons:
+Now, let's write some data extraction code to extract each product's data. This should look familiar if you went through the [Web scraping basics for JavaScript devs](/academy/scraping-basics-javascript) course:

 ```js
 import { CheerioCrawler } from 'crawlee';
diff --git a/sources/academy/webscraping/advanced_web_scraping/crawling/sitemaps-vs-search.md b/sources/academy/webscraping/advanced_web_scraping/crawling/sitemaps-vs-search.md
index f34b24d261..5fea5d36df 100644
--- a/sources/academy/webscraping/advanced_web_scraping/crawling/sitemaps-vs-search.md
+++ b/sources/academy/webscraping/advanced_web_scraping/crawling/sitemaps-vs-search.md
@@ -5,7 +5,7 @@ sidebar_position: 1
 slug: /advanced-web-scraping/crawling/sitemaps-vs-search
 ---

-The core crawling problem comes to down to ensuring that we reliably find all detail pages on the target website or inside its categories. This is trivial for small sites. We just open the home page or category pages and paginate to the end as we did in the [Web scraping basics for JavaScript devs](/academy/web-scraping-for-beginners) course.
+The core crawling problem comes down to ensuring that we reliably find all detail pages on the target website or inside its categories. This is trivial for small sites. We just open the home page or category pages and paginate to the end.

 Unfortunately, _most modern websites restrict pagination_ only to somewhere between 1 and 10,000 products. Solving this problem might seem relatively straightforward at first but there are multiple hurdles that we will explore in this lesson.
@@ -31,7 +31,7 @@ Sitemap is usually a simple XML file that contains a list of all pages on the we
 - _Does not directly reflect the website_ - There is no way you can ensure that all pages on the website are in the sitemap. The sitemap also can contain pages that were already removed and will return 404s. This is a major downside of sitemaps which prevents us from using them as the only source of URLs.
 - _Updated in intervals_ - Sitemaps are usually not updated in real-time. This means that you might miss some pages if you scrape them too soon after they were added to the website. Common update intervals are 1 day or 1 week.
 - _Hard to find or unavailable_ - Sitemaps are not always trivial to locate. They can be deployed on a CDN with unpredictable URLs. Sometimes they are not available at all.
-- _Streamed, compressed, and archived_ - Sitemaps are often streamed and archived with .tgz extensions and compressed with gzip. This means that you cannot use default HTTP client settings and must handle these cases with extra code or use a scraping framework.
+- _Streamed, compressed, and archived_ - Sitemaps are often streamed and archived with .tgz extensions and compressed with Gzip. This means that you cannot use default HTTP client settings and must handle these cases with extra code or use a scraping framework.

 ## Pros and cons of categories, search, and filters
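That last point deserves a small illustration: with a plain HTTP client you must decompress the sitemap yourself before parsing it. A sketch using built-in Node.js modules and a hypothetical gzipped sitemap URL:

```js
import { gunzipSync } from 'node:zlib';

// Hypothetical URL; many large sites serve their sitemaps as .xml.gz archives.
const res = await fetch('https://example.com/sitemap.xml.gz');
const xml = gunzipSync(Buffer.from(await res.arrayBuffer())).toString('utf8');

// Naive <loc> extraction; a real crawler should use a proper XML parser instead.
const urls = [...xml.matchAll(/<loc>(.*?)<\/loc>/g)].map((match) => match[1]);
console.log(`Found ${urls.length} URLs`);
```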
diff --git a/sources/academy/webscraping/advanced_web_scraping/index.md b/sources/academy/webscraping/advanced_web_scraping/index.md
index fe58884117..33ffc603b0 100644
--- a/sources/academy/webscraping/advanced_web_scraping/index.md
+++ b/sources/academy/webscraping/advanced_web_scraping/index.md
@@ -6,7 +6,7 @@ category: web scraping & automation
 slug: /advanced-web-scraping
 ---

-In the [Web scraping basics for JavaScript devs](/academy/web-scraping-for-beginners) course, we have learned the necessary basics required to create a scraper. In the following courses, we learned more about specific practices and techniques that will help us to solve most of the problems we will face.
+In the [Web scraping basics for JavaScript devs](/academy/scraping-basics-javascript) course, we have learned the necessary basics required to create a scraper. In the following courses, we learned more about specific practices and techniques that will help us to solve most of the problems we will face.

 In this course, we will take all of that knowledge, add a few more advanced concepts, and apply them to learn how to build a production-ready web scraper.
diff --git a/sources/academy/webscraping/anti_scraping/mitigation/using_proxies.md b/sources/academy/webscraping/anti_scraping/mitigation/using_proxies.md
index f824153761..7254193448 100644
--- a/sources/academy/webscraping/anti_scraping/mitigation/using_proxies.md
+++ b/sources/academy/webscraping/anti_scraping/mitigation/using_proxies.md
@@ -7,15 +7,21 @@ slug: /anti-scraping/mitigation/using-proxies

 **Learn how to use and automagically rotate proxies in your scrapers by using Crawlee, and a bit about how to obtain pools of proxies.**

+:::caution Updates coming
+
+This lesson is subject to change because it currently relies on code from our archived **Web scraping basics for JavaScript devs** course. For now, you can still access the archived course, but we plan to completely retire it in a few months. This lesson will be updated to remove the dependency.
+
+:::
+
 ---

-In the [**Web scraping basics for JavaScript devs**](../../scraping_basics_javascript/crawling/pro_scraping.md) course, we learned about the power of Crawlee, and how it can streamline the development process of web crawlers. You've already seen how powerful the `crawlee` package is; however, what you've been exposed to thus far is only the tip of the iceberg.
+In the [**Web scraping basics for JavaScript devs**](../../scraping_basics_legacy/crawling/pro_scraping.md) course, we learned about the power of Crawlee, and how it can streamline the development process of web crawlers. You've already seen how powerful the `crawlee` package is; however, what you've been exposed to thus far is only the tip of the iceberg.

 Because proxies are so widely used in the scraping world, Crawlee has built-in features for implementing them in an effective way. One of the main functionalities that comes baked into Crawlee is proxy rotation, which is when each request is sent through a different proxy from a proxy pool.

 ## Implementing proxies in a scraper {#implementing-proxies}

-Let's borrow some scraper code from the end of the [pro-scraping](../../scraping_basics_javascript/crawling/pro_scraping.md) lesson in our **Web scraping basics for JavaScript devs** course and paste it into a new file called **proxies.js**. This code enqueues all of the product links on [demo-webstore.apify.org](https://demo-webstore.apify.org)'s on-sale page, then makes a request to each product page and scrapes data about each one:
+Let's borrow some scraper code from the end of the [pro-scraping](../../scraping_basics_legacy/crawling/pro_scraping.md) lesson in our **Web scraping basics for JavaScript devs** course and paste it into a new file called **proxies.js**. This code enqueues all of the product links on [demo-webstore.apify.org](https://demo-webstore.apify.org)'s on-sale page, then makes a request to each product page and scrapes data about each one:

 ```js
 // crawlee.js
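As a taste of the proxy rotation described above, Crawlee's `ProxyConfiguration` rotates a list of proxy URLs across requests. A minimal sketch with placeholder proxies:

```js
import { CheerioCrawler, ProxyConfiguration } from 'crawlee';

// Placeholder proxy URLs; substitute real ones from your provider.
const proxyConfiguration = new ProxyConfiguration({
  proxyUrls: ['http://proxy-1.example.com:8000', 'http://proxy-2.example.com:8000'],
});

const crawler = new CheerioCrawler({
  proxyConfiguration, // each request is routed through a proxy from the pool
  async requestHandler({ request, proxyInfo }) {
    console.log(`${request.url} fetched via ${proxyInfo?.url}`);
  },
});

await crawler.run(['https://demo-webstore.apify.org']);
```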
diff --git a/sources/academy/webscraping/puppeteer_playwright/executing_scripts/extracting_data.md b/sources/academy/webscraping/puppeteer_playwright/executing_scripts/extracting_data.md
index 95313816cb..f2ebab8b8d 100644
--- a/sources/academy/webscraping/puppeteer_playwright/executing_scripts/extracting_data.md
+++ b/sources/academy/webscraping/puppeteer_playwright/executing_scripts/extracting_data.md
@@ -12,7 +12,7 @@ import TabItem from '@theme/TabItem';

 ---

-Now that we know how to execute scripts on a page, we're ready to learn a bit about [data extraction](../../scraping_basics_javascript/data_extraction/index.md). In this lesson, we'll be scraping all the on-sale products from our [Fakestore](https://demo-webstore.apify.org/search/on-sale) website. Playwright & Puppeteer offer two main methods for data extraction:
+Now that we know how to execute scripts on a page, we're ready to learn a bit about data extraction. In this lesson, we'll be scraping all the on-sale products from our [Fakestore](https://demo-webstore.apify.org/search/on-sale) website. Playwright & Puppeteer offer two main methods for data extraction:

 1. Directly in `page.evaluate()` and other evaluate functions such as `page.$$eval()`.
 2. In the Node.js context using a parsing library such as [Cheerio](https://www.npmjs.com/package/cheerio)
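A short sketch contrasting those two approaches, using Playwright and a placeholder selector (the lesson's real selectors will differ):

```js
import { chromium } from 'playwright';
import * as cheerio from 'cheerio';

const browser = await chromium.launch();
const page = await browser.newPage();
await page.goto('https://demo-webstore.apify.org/search/on-sale');

// 1. Extract inside the browser context with an evaluate function.
const namesInBrowser = await page.$$eval('[class*="product"]', (elements) =>
  elements.map((element) => element.textContent.trim()),
);

// 2. Or pull the rendered HTML into Node.js and parse it with Cheerio.
const $ = cheerio.load(await page.content());
const namesInNode = $('[class*="product"]').map((i, el) => $(el).text().trim()).get();

console.log(namesInBrowser.length, namesInNode.length);
await browser.close();
```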
diff --git a/sources/academy/webscraping/puppeteer_playwright/index.md b/sources/academy/webscraping/puppeteer_playwright/index.md
index 18343b5cea..57c62f560f 100644
--- a/sources/academy/webscraping/puppeteer_playwright/index.md
+++ b/sources/academy/webscraping/puppeteer_playwright/index.md
@@ -61,8 +61,6 @@ npm install puppeteer
 ```

-> For a more in-depth guide on how to set up the basic environment we'll be using in this tutorial, check out the [**Computer preparation**](../scraping_basics_javascript/data_extraction/computer_preparation.md) lesson in the **Web scraping basics for JavaScript devs** course
-
 ## Course overview {#course-overview}

 1. [Launching a browser](./browser.md)
diff --git a/sources/academy/webscraping/puppeteer_playwright/page/interacting_with_a_page.md b/sources/academy/webscraping/puppeteer_playwright/page/interacting_with_a_page.md
index 99d0ecbbbb..ce247e84e6 100644
--- a/sources/academy/webscraping/puppeteer_playwright/page/interacting_with_a_page.md
+++ b/sources/academy/webscraping/puppeteer_playwright/page/interacting_with_a_page.md
@@ -53,7 +53,7 @@ With `page.click()`, Puppeteer and Playwright actually drag the mouse and click,

 Notice that in the Playwright example, we are using a different selector than in the Puppeteer example. This is because Playwright supports [many custom CSS selectors](https://playwright.dev/docs/other-locators#css-elements-matching-one-of-the-conditions), such as the **has-text** pseudo class. As a rule of thumb, using text selectors is much more preferable to using regular selectors, as they are much less likely to break. If Google makes the sibling above the **Accept all** button a `` element instead of a `