From f012a0b0eb554915a9c07de53a760d34e1e7a7d1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adri=C3=A1n=20Chaves?= Date: Wed, 2 Nov 2022 20:06:08 +0100 Subject: [PATCH 01/13] Add SwitchPage (unfinished) --- docs/index.rst | 1 + docs/page-objects/additional-requests.rst | 1 + docs/page-objects/layouts.rst | 152 ++++++++++++++++++++++ tests/test_pages.py | 69 ++++++++++ web_poet/pages.py | 22 ++++ 5 files changed, 245 insertions(+) create mode 100644 docs/page-objects/layouts.rst diff --git a/docs/index.rst b/docs/index.rst index bf37258d..e90b8593 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -41,6 +41,7 @@ and the motivation behind ``web-poet``, start with :ref:`from-ground-up`. page-objects/additional-requests page-objects/fields page-objects/rules + Webpage layouts page-objects/retries page-objects/page-params diff --git a/docs/page-objects/additional-requests.rst b/docs/page-objects/additional-requests.rst index 930ea284..32f1797d 100644 --- a/docs/page-objects/additional-requests.rst +++ b/docs/page-objects/additional-requests.rst @@ -1,4 +1,5 @@ .. _advanced-requests: +.. _page-objects: =================== Additional Requests diff --git a/docs/page-objects/layouts.rst b/docs/page-objects/layouts.rst new file mode 100644 index 00000000..a41a0809 --- /dev/null +++ b/docs/page-objects/layouts.rst @@ -0,0 +1,152 @@ +.. _layouts: + +=============== +Webpage layouts +=============== + +Different webpages may show the same *type* of page, but different *data*. For +example, in an e-commerce website there are usually many product detail pages, +each showing data from a different product. + +The code that those webpages have in common is their **webpage layout**. + +Coding for webpage layouts +========================== + +Webpage layouts should inform how you organize your data extraction code. + +A good practice to keep your code maintainable is to have a separate :ref:`page +object class ` per webpage layout. + +Trying to support multiple webpage layouts with the same page object class can +make your class hard to maintain. + + +Identifying webpage layouts +=========================== + +There is no precise way to determine whether 2 webpages have the same or a +different webpage layout. You must decide based on what you know, and be ready +to adapt if things change. + +It is also often difficult to identify webpage layouts before you start writing +extraction code. Completely different webpage layouts can have the same look, +and very similar webpage layouts can look completely different. + +It can be a good starting point to assume that, for a given combination of +data type and website, there is going to be a single webpage layout. For +example, assume that all product pages of a given e-commerce website will have +the same webpage layout. + +Then, as you write a :ref:`page object class ` for that webpage +layout, you may find out more, and adapt. + +When the same piece of information must be extracted from a different place for +different webpages, that is a sign that you may be dealing with more than 1 +webpage layout. For example, if on some webpages the product name is in an +``h1`` element, but on some webpages it is in an ``h2`` element, chances are +there are at least 2 different webpage layouts. + +However, whether you continue to work as if everything uses the same webpage +layout, or you split your page object class into 2 page object classes, each +targetting one of the webpage layouts you have found, it is entirely up to you. + +Ask yourself: Is supporting all webpage layout differences making your page +object class implementation only a few lines of code longer, or is it making it +an unmaintainable bowl of spagetti code? + + +Mapping webpage layouts +======================= + +Once you have written a :ref:`page object class ` for a webpage +layout, you need to make it so that your page object class is used for webpages +that use that webpage layout. + +URL patterns +------------ + +Webpage layouts are often associated to specific URL patterns. For example, all +the product detail pages of an e-commerce website usually have similar URLs, +such as ``https://example.com/product/``. + +When that is the case, you can :ref:`associate your page object class to the +corresponding URL pattern `. + + +Switch page object classes +-------------------------- + +Sometimes it is impossible to know, based on the target URL, which webpage +layout you are getting. For example, during `A/B testing`_, you could get a +random webpage layout on every request. + +.. _A/B testing: https://en.wikipedia.org/wiki/A/B_testing + +For these scenarios, we recommend that you create a special “switch” page +object class, and use it to switch to the right page object class at run time +based on the input you receive. + +Your switch page object class should: + +#. Request all the inputs that the candidate page object classes may need. + + For example, if there are 2 candidate page object classes, and 1 of them + requires browser HTML as input, while the other one requires an HTTP + response, your switch page object class must request both. + + If combining different inputs is a problem, consider refactoring the + candidate page object classes to require similar inputs. + +#. On its :meth:`~web_poet.pages.ItemPage.to_item` method: + + #. Determine, based on the inputs, which candidate page object class to + use. + + #. Create an instance of the selected candidade page object class with the + necessary input, call its :meth:`~web_poet.pages.ItemPage.to_item` + method, and return its result. + +You may use :class:`~web_poet.pages.SwitchPage` as a base class for your switch +page object class, so you only need to implement the +:class:`~web_poet.pages.SwitchPage.switch` method that determines which +candidate page object class to use. For example: + +.. code-block:: python + + import attrs + from web_poet import handle_urls, HttpResponse, Injectable, ItemPage, SwitchPage + + + @attrs.define + class Header: + text: str + + + @attrs.define + class H1Page(ItemPage[Header]): + response: HttpResponse + + @field + def text(self) -> str: + return self.response.css("h1::text").get() + + + @attrs.define + class H2Page(ItemPage[Header]): + response: HttpResponse + + @field + def text(self) -> str: + return self.response.css("h2::text").get() + + + @handle_urls("example.com") + @attrs.define + class HeaderSwitchPage(SwitchPage[Header]): + response: HttpResponse + + async def switch(self) -> Injectable: + if self.response.css("h1::text"): + return H1Page + return H2Page diff --git a/tests/test_pages.py b/tests/test_pages.py index fa3cf8df..7ba3df2b 100644 --- a/tests/test_pages.py +++ b/tests/test_pages.py @@ -10,6 +10,7 @@ ItemT, ItemWebPage, Returns, + SwitchPage, WebPage, is_injectable, ) @@ -33,6 +34,74 @@ def to_item(self) -> dict: } +@pytest.mark.asyncio +async def test_switch_page_object(): + + @attrs.define + class Header: + text: str + + + @attrs.define + class H1Page(ItemPage[Header]): + response: HttpResponse + + @field + def text(self) -> str: + return self.response.css("h1::text").get() + + + @attrs.define + class H2Page(ItemPage[Header]): + response: HttpResponse + + @field + def text(self) -> str: + return self.response.css("h2::text").get() + + + @attrs.define + class HeaderSwitchPage(SwitchPage[Header]): + response: HttpResponse + + async def switch(self) -> Injectable: + if self.response.css("h1::text"): + return H1Page + return H2Page + + html_h1 = b""" + + + + h1 + + +

a

+ + + """ + html_h2 = b""" + + + + h2 + + +

b

+ + + """ + + response1 = HttpResponse("https://example.com", body=html_h1) + response2 = HttpResponse("https://example.com", body=html_h2) + + item1 = await HeaderSwitchPage(response=response1).to_item() + item2 = await HeaderSwitchPage(response=response2).to_item() + + assert item1.text == "a" + assert item2.text == "b" + + def test_web_page_object(book_list_html_response) -> None: class MyWebPage(WebPage): def to_item(self) -> dict: # type: ignore diff --git a/web_poet/pages.py b/web_poet/pages.py index 77b39af3..4bb5b758 100644 --- a/web_poet/pages.py +++ b/web_poet/pages.py @@ -68,6 +68,28 @@ async def to_item(self) -> ItemT: ) +class SwitchPage(Injectable, Returns[ItemT]): + """Base class for :ref:`switch page object classes `. + + Subclasses must reimplement the :meth:`switch` method. + """ + + @abc.abstractmethod + async def switch(self) -> Injectable: + """Return the right :ref:`page object class ` based on + the received input.""" + raise NotImplementedError + + async def to_item(self) -> ItemT: + """Create an instance of the class that :meth:`switch` returns with the + required input, and return the output of its + :meth:`~web_poet.pages.ItemPage.to_item` method.""" + page_object_class = await self.switch() + # page_object = page_object_class(...) # TODO: pass the right inputs + page_object = page_object_class(response=self.response) + return await page_object.to_item() + + @attr.s(auto_attribs=True) class WebPage(ItemPage[ItemT], ResponseShortcutsMixin): """Base Page Object which requires :class:`~.HttpResponse` From 62cd1ed91a7cf96cea69cccee716ed846d82a6f1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adri=C3=A1n=20Chaves?= Date: Wed, 2 Nov 2022 20:42:20 +0100 Subject: [PATCH 02/13] Add missing reference --- docs/page-objects/layouts.rst | 2 ++ 1 file changed, 2 insertions(+) diff --git a/docs/page-objects/layouts.rst b/docs/page-objects/layouts.rst index a41a0809..beba5897 100644 --- a/docs/page-objects/layouts.rst +++ b/docs/page-objects/layouts.rst @@ -74,6 +74,8 @@ When that is the case, you can :ref:`associate your page object class to the corresponding URL pattern `. +.. _switch: + Switch page object classes -------------------------- From 31fc060b2733617222a4c2195c4c44b92f80b619 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adri=C3=A1n=20Chaves?= Date: Wed, 9 Nov 2022 12:27:43 +0100 Subject: [PATCH 03/13] Get a clean tox pass --- docs/page-objects/additional-requests.rst | 1 - tests/test_pages.py | 28 ++++++++++++----------- tox.ini | 2 +- web_poet/pages.py | 8 +++---- 4 files changed, 19 insertions(+), 20 deletions(-) diff --git a/docs/page-objects/additional-requests.rst b/docs/page-objects/additional-requests.rst index 32f1797d..930ea284 100644 --- a/docs/page-objects/additional-requests.rst +++ b/docs/page-objects/additional-requests.rst @@ -1,5 +1,4 @@ .. _advanced-requests: -.. _page-objects: =================== Additional Requests diff --git a/tests/test_pages.py b/tests/test_pages.py index 7ba3df2b..c08a5f8c 100644 --- a/tests/test_pages.py +++ b/tests/test_pages.py @@ -9,8 +9,8 @@ ItemPage, ItemT, ItemWebPage, + MultiLayoutPage, Returns, - SwitchPage, WebPage, is_injectable, ) @@ -36,38 +36,36 @@ def to_item(self) -> dict: @pytest.mark.asyncio async def test_switch_page_object(): - @attrs.define class Header: text: str - @attrs.define class H1Page(ItemPage[Header]): response: HttpResponse @field - def text(self) -> str: + def text(self) -> Optional[str]: return self.response.css("h1::text").get() - @attrs.define class H2Page(ItemPage[Header]): response: HttpResponse @field - def text(self) -> str: + def text(self) -> Optional[str]: return self.response.css("h2::text").get() - @attrs.define - class HeaderSwitchPage(SwitchPage[Header]): + class HeaderMultiLayoutPage(MultiLayoutPage[Header]): response: HttpResponse + h1: H1Page + h2: H2Page - async def switch(self) -> Injectable: + async def switch(self) -> ItemPage[Header]: if self.response.css("h1::text"): - return H1Page - return H2Page + return self.h1 + return self.h2 html_h1 = b""" @@ -93,10 +91,14 @@ async def switch(self) -> Injectable: """ response1 = HttpResponse("https://example.com", body=html_h1) + h1_1 = H1Page(response=response1) + h2_1 = H2Page(response=response1) response2 = HttpResponse("https://example.com", body=html_h2) + h1_2 = H1Page(response=response2) + h2_2 = H2Page(response=response2) - item1 = await HeaderSwitchPage(response=response1).to_item() - item2 = await HeaderSwitchPage(response=response2).to_item() + item1 = await HeaderMultiLayoutPage(response=response1, h1=h1_1, h2=h2_1).to_item() + item2 = await HeaderMultiLayoutPage(response=response2, h1=h1_2, h2=h2_2).to_item() assert item1.text == "a" assert item2.text == "b" diff --git a/tox.ini b/tox.ini index 58de4454..ffd2b8ce 100644 --- a/tox.ini +++ b/tox.ini @@ -1,5 +1,5 @@ [tox] -envlist = py37,py38,py39,py310,py311,mypy,docs,types +envlist = py37,py38,py39,py310,py311,mypy,docs,types,linters [pytest] asyncio_mode = strict diff --git a/web_poet/pages.py b/web_poet/pages.py index 4bb5b758..21272c73 100644 --- a/web_poet/pages.py +++ b/web_poet/pages.py @@ -68,14 +68,14 @@ async def to_item(self) -> ItemT: ) -class SwitchPage(Injectable, Returns[ItemT]): +class MultiLayoutPage(ItemPage[ItemT]): """Base class for :ref:`switch page object classes `. Subclasses must reimplement the :meth:`switch` method. """ @abc.abstractmethod - async def switch(self) -> Injectable: + async def switch(self) -> ItemPage[ItemT]: """Return the right :ref:`page object class ` based on the received input.""" raise NotImplementedError @@ -84,9 +84,7 @@ async def to_item(self) -> ItemT: """Create an instance of the class that :meth:`switch` returns with the required input, and return the output of its :meth:`~web_poet.pages.ItemPage.to_item` method.""" - page_object_class = await self.switch() - # page_object = page_object_class(...) # TODO: pass the right inputs - page_object = page_object_class(response=self.response) + page_object = await self.switch() return await page_object.to_item() From 78178135ae35adfcbe46d91457a052a87f4a17c2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adri=C3=A1n=20Chaves?= Date: Wed, 9 Nov 2022 12:55:53 +0100 Subject: [PATCH 04/13] Refactor into the proposed API improvement --- docs/page-objects/layouts.rst | 78 +++++++++++++++++++---------------- tests/test_pages.py | 16 +++---- web_poet/pages.py | 18 ++++---- 3 files changed, 58 insertions(+), 54 deletions(-) diff --git a/docs/page-objects/layouts.rst b/docs/page-objects/layouts.rst index beba5897..60f2894e 100644 --- a/docs/page-objects/layouts.rst +++ b/docs/page-objects/layouts.rst @@ -49,11 +49,11 @@ there are at least 2 different webpage layouts. However, whether you continue to work as if everything uses the same webpage layout, or you split your page object class into 2 page object classes, each -targetting one of the webpage layouts you have found, it is entirely up to you. +targeting one of the webpage layouts you have found, it is entirely up to you. Ask yourself: Is supporting all webpage layout differences making your page object class implementation only a few lines of code longer, or is it making it -an unmaintainable bowl of spagetti code? +an unmaintainable bowl of spaghetti code? Mapping webpage layouts @@ -74,10 +74,10 @@ When that is the case, you can :ref:`associate your page object class to the corresponding URL pattern `. -.. _switch: +.. _multi-layout: -Switch page object classes --------------------------- +Multi-layout page object classes +-------------------------------- Sometimes it is impossible to know, based on the target URL, which webpage layout you are getting. For example, during `A/B testing`_, you could get a @@ -85,39 +85,47 @@ random webpage layout on every request. .. _A/B testing: https://en.wikipedia.org/wiki/A/B_testing -For these scenarios, we recommend that you create a special “switch” page -object class, and use it to switch to the right page object class at run time -based on the input you receive. +For these scenarios, we recommend that you create different page object classes +for the different layouts that you may get, and then write a special +“multi-layout” page object class, and use it to select the right page object +class at run time based on the input you receive. -Your switch page object class should: +Your multi-layout page object class should: -#. Request all the inputs that the candidate page object classes may need. +#. Declare attributes for the input that you will need to determine which page + object class to use. - For example, if there are 2 candidate page object classes, and 1 of them - requires browser HTML as input, while the other one requires an HTTP - response, your switch page object class must request both. + For example, declare an :class:`HttpResponse` attribute to select a page + object class based on the response content. - If combining different inputs is a problem, consider refactoring the - candidate page object classes to require similar inputs. +#. Declare an attribute for every page object class that you may depending on + which webpage layout you get from the target website. + + Note that all inputs of all those page object classes will be resolved and + requested along with the input of your multi-layout page object class. For + example, if one page object class requires browser HTML as input, while + another requires an HTTP response, your multi-layout page object class asks + for both inputs. + + If combining different inputs is a problem, consider refactoring your page + object classes to require similar inputs. #. On its :meth:`~web_poet.pages.ItemPage.to_item` method: - #. Determine, based on the inputs, which candidate page object class to - use. + #. Determine, based on inputs, which page object to use. - #. Create an instance of the selected candidade page object class with the - necessary input, call its :meth:`~web_poet.pages.ItemPage.to_item` - method, and return its result. + #. Return the output of the :meth:`~web_poet.pages.ItemPage.to_item` + method of that page object. -You may use :class:`~web_poet.pages.SwitchPage` as a base class for your switch -page object class, so you only need to implement the -:class:`~web_poet.pages.SwitchPage.switch` method that determines which -candidate page object class to use. For example: +You may use :class:`~web_poet.pages.MultiLayoutPage` as a base class for your +multi-layout page object class, so you only need to implement the +:class:`~web_poet.pages.MultiLayoutPage.layout` method that determines which +page object to use. For example: .. code-block:: python import attrs - from web_poet import handle_urls, HttpResponse, Injectable, ItemPage, SwitchPage + from web_poet import handle_urls, HttpResponse, ItemPage, MultiLayoutPage, WebPage @attrs.define @@ -126,29 +134,29 @@ candidate page object class to use. For example: @attrs.define - class H1Page(ItemPage[Header]): - response: HttpResponse + class H1Page(WebPage[Header]): @field def text(self) -> str: - return self.response.css("h1::text").get() + return self.css("h1::text").get() @attrs.define - class H2Page(ItemPage[Header]): - response: HttpResponse + class H2Page(WebPage[Header]): @field def text(self) -> str: - return self.response.css("h2::text").get() + return self.css("h2::text").get() @handle_urls("example.com") @attrs.define - class HeaderSwitchPage(SwitchPage[Header]): + class HeaderMultiLayoutPage(MultiLayoutPage[Header]): response: HttpResponse + h1: H1Page + h2: H2Page - async def switch(self) -> Injectable: + async def layout(self) -> ItemPage[Header]: if self.response.css("h1::text"): - return H1Page - return H2Page + return self.h1 + return self.h2 diff --git a/tests/test_pages.py b/tests/test_pages.py index c08a5f8c..aec1bc55 100644 --- a/tests/test_pages.py +++ b/tests/test_pages.py @@ -35,26 +35,22 @@ def to_item(self) -> dict: @pytest.mark.asyncio -async def test_switch_page_object(): +async def test_multi_layout_page_object(): @attrs.define class Header: text: str @attrs.define - class H1Page(ItemPage[Header]): - response: HttpResponse - + class H1Page(WebPage[Header]): @field def text(self) -> Optional[str]: - return self.response.css("h1::text").get() + return self.css("h1::text").get() @attrs.define - class H2Page(ItemPage[Header]): - response: HttpResponse - + class H2Page(WebPage[Header]): @field def text(self) -> Optional[str]: - return self.response.css("h2::text").get() + return self.css("h2::text").get() @attrs.define class HeaderMultiLayoutPage(MultiLayoutPage[Header]): @@ -62,7 +58,7 @@ class HeaderMultiLayoutPage(MultiLayoutPage[Header]): h1: H1Page h2: H2Page - async def switch(self) -> ItemPage[Header]: + async def layout(self) -> ItemPage[Header]: if self.response.css("h1::text"): return self.h1 return self.h2 diff --git a/web_poet/pages.py b/web_poet/pages.py index 21272c73..47ff8522 100644 --- a/web_poet/pages.py +++ b/web_poet/pages.py @@ -69,22 +69,22 @@ async def to_item(self) -> ItemT: class MultiLayoutPage(ItemPage[ItemT]): - """Base class for :ref:`switch page object classes `. + """Base class for :ref:`multi-layout page object classes `. - Subclasses must reimplement the :meth:`switch` method. + Subclasses must reimplement the :meth:`layout` method. """ @abc.abstractmethod - async def switch(self) -> ItemPage[ItemT]: - """Return the right :ref:`page object class ` based on - the received input.""" + async def layout(self) -> ItemPage[ItemT]: + """Return the :ref:`page object ` to use based on the + received input.""" raise NotImplementedError async def to_item(self) -> ItemT: - """Create an instance of the class that :meth:`switch` returns with the - required input, and return the output of its - :meth:`~web_poet.pages.ItemPage.to_item` method.""" - page_object = await self.switch() + """Return the output of the :meth:`~web_poet.pages.ItemPage.to_item` + method of the :ref:`page object ` that :meth:`layout` + returns.""" + page_object = await self.layout() return await page_object.to_item() From 36ad05629ff9414bd34989d56772e18a8e2068b1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adri=C3=A1n=20Chaves?= Date: Wed, 9 Nov 2022 12:59:15 +0100 Subject: [PATCH 05/13] Remove unneeded NotImplementedError --- web_poet/pages.py | 1 - 1 file changed, 1 deletion(-) diff --git a/web_poet/pages.py b/web_poet/pages.py index 47ff8522..21ff0e48 100644 --- a/web_poet/pages.py +++ b/web_poet/pages.py @@ -78,7 +78,6 @@ class MultiLayoutPage(ItemPage[ItemT]): async def layout(self) -> ItemPage[ItemT]: """Return the :ref:`page object ` to use based on the received input.""" - raise NotImplementedError async def to_item(self) -> ItemT: """Return the output of the :meth:`~web_poet.pages.ItemPage.to_item` From b5d61ea600896ffbe666e775624ac020136b9318 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adri=C3=A1n=20Chaves?= Date: Wed, 9 Nov 2022 13:01:20 +0100 Subject: [PATCH 06/13] Add MultiLayoutPage to the API reference --- docs/api-reference.rst | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/docs/api-reference.rst b/docs/api-reference.rst index 59e6f843..9b928ea0 100644 --- a/docs/api-reference.rst +++ b/docs/api-reference.rst @@ -55,6 +55,10 @@ Pages :show-inheritance: :members: +.. autoclass:: MultiLayoutPage + :show-inheritance: + :members: layout + Mixins ====== From 6778aa34f6ced6b3cce92653f05426c5ff4a9a12 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adri=C3=A1n=20Chaves?= Date: Wed, 9 Nov 2022 13:24:17 +0100 Subject: [PATCH 07/13] Mention priority resolution on the multi-layout page object class documentation --- docs/page-objects/layouts.rst | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/docs/page-objects/layouts.rst b/docs/page-objects/layouts.rst index 60f2894e..683152ba 100644 --- a/docs/page-objects/layouts.rst +++ b/docs/page-objects/layouts.rst @@ -160,3 +160,8 @@ page object to use. For example: if self.response.css("h1::text"): return self.h1 return self.h2 + +.. note:: If you use :func:`~web_poet.handle_urls` both for your multi-layout + page object class and for any of the page object classes that it + uses, you may need to :ref:`grant your multi-layour page object class + a higher priority `. From 073c4ab764e65e783fdf0a8b59f000bf366e8cd8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adri=C3=A1n=20Chaves?= Date: Wed, 9 Nov 2022 13:27:36 +0100 Subject: [PATCH 08/13] Clarify that the layouts under a multi-layout should return the same type of item --- docs/page-objects/layouts.rst | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/docs/page-objects/layouts.rst b/docs/page-objects/layouts.rst index 683152ba..5833aaf7 100644 --- a/docs/page-objects/layouts.rst +++ b/docs/page-objects/layouts.rst @@ -98,8 +98,11 @@ Your multi-layout page object class should: For example, declare an :class:`HttpResponse` attribute to select a page object class based on the response content. -#. Declare an attribute for every page object class that you may depending on - which webpage layout you get from the target website. +#. Declare an attribute for every page object class that you may use depending + on which webpage layout you get from the target website. + + They all should return the same type of :ref:`item ` as your + multi-layout page object class. Note that all inputs of all those page object classes will be resolved and requested along with the input of your multi-layout page object class. For @@ -163,5 +166,5 @@ page object to use. For example: .. note:: If you use :func:`~web_poet.handle_urls` both for your multi-layout page object class and for any of the page object classes that it - uses, you may need to :ref:`grant your multi-layour page object class + uses, you may need to :ref:`grant your multi-layout page object class a higher priority `. From b51f056d7e258c68198cede755453f779484d350 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adri=C3=A1n=20Chaves?= Date: Tue, 22 Nov 2022 07:12:48 +0100 Subject: [PATCH 09/13] Remove unnecessary @attrs.define Co-authored-by: Kevin Lloyd Bernal --- docs/page-objects/layouts.rst | 1 - 1 file changed, 1 deletion(-) diff --git a/docs/page-objects/layouts.rst b/docs/page-objects/layouts.rst index 5833aaf7..bb8abdc1 100644 --- a/docs/page-objects/layouts.rst +++ b/docs/page-objects/layouts.rst @@ -131,7 +131,6 @@ page object to use. For example: from web_poet import handle_urls, HttpResponse, ItemPage, MultiLayoutPage, WebPage - @attrs.define class Header: text: str From 308c4bfb9f002aca3747f532037761e2af2541fd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adri=C3=A1n=20Chaves?= Date: Tue, 22 Nov 2022 07:57:40 +0100 Subject: [PATCH 10/13] Apply feedback --- docs/page-objects/layouts.rst | 122 ++++++++++++++++++++++++++-------- tests/test_pages.py | 2 - web_poet/pages.py | 2 +- 3 files changed, 94 insertions(+), 32 deletions(-) diff --git a/docs/page-objects/layouts.rst b/docs/page-objects/layouts.rst index bb8abdc1..a6200384 100644 --- a/docs/page-objects/layouts.rst +++ b/docs/page-objects/layouts.rst @@ -96,7 +96,13 @@ Your multi-layout page object class should: object class to use. For example, declare an :class:`HttpResponse` attribute to select a page - object class based on the response content. + object class based on the response content: + + .. code-block:: python + + class MyMultiLayoutPage(ItemPage): + response: HttpResponse + ... #. Declare an attribute for every page object class that you may use depending on which webpage layout you get from the target website. @@ -104,11 +110,51 @@ Your multi-layout page object class should: They all should return the same type of :ref:`item ` as your multi-layout page object class. + For example: + + .. code-block:: python + + class MyItem: + ... + + @attrs.define + class MyPage1(ItemPage[MyItem]): + ... + + @attrs.define + class MyPage2(ItemPage[MyItem]): + ... + + @attrs.define + class MyMultiLayoutPage(ItemPage[MyItem]): + ... + page1: MyPage1 + page2: MyPage2 + Note that all inputs of all those page object classes will be resolved and - requested along with the input of your multi-layout page object class. For - example, if one page object class requires browser HTML as input, while - another requires an HTTP response, your multi-layout page object class asks - for both inputs. + requested along with the input of your multi-layout page object class. + + For example, given: + + .. code-block:: python + + @attrs.define + class MyPage1(ItemPage): + response: HttpResponse + + @attrs.define + class MyPage2(ItemPage): + response: BrowserHtml + + @attrs.define + class MyMultiLayoutPage(ItemPage): + response: HttpResponse + page1: MyPage1 + page2: MyPage2 + + Using ``MyMultiLayoutPage`` causes the use of both ``HttpResponse`` and + ``BrowserHtml``, because ``MyMultiLayoutPage`` requires ``MyPage2``, and + ``MyPage2`` requires ``BrowserHtml``. If combining different inputs is a problem, consider refactoring your page object classes to require similar inputs. @@ -120,6 +166,23 @@ Your multi-layout page object class should: #. Return the output of the :meth:`~web_poet.pages.ItemPage.to_item` method of that page object. + For example: + + .. code-block:: python + + @attrs.define + class MyMultiLayoutPage(ItemPage[MyItem]): + response: HttpResponse + page1: MyPage1 + page2: MyPage2 + + async def to_item(self) -> MyItem: + if self.response.css(".foo"): + page_object = self.page1 + else: + page_object = self.page2 + return await page_object.to_item() + You may use :class:`~web_poet.pages.MultiLayoutPage` as a base class for your multi-layout page object class, so you only need to implement the :class:`~web_poet.pages.MultiLayoutPage.layout` method that determines which @@ -127,41 +190,42 @@ page object to use. For example: .. code-block:: python - import attrs - from web_poet import handle_urls, HttpResponse, ItemPage, MultiLayoutPage, WebPage + from typing import Optional + + import attrs + from web_poet import handle_urls, HttpResponse, ItemPage, MultiLayoutPage, WebPage - class Header: - text: str + @attrs.define + class Header: + text: str - @attrs.define - class H1Page(WebPage[Header]): + class H1Page(WebPage[Header]): - @field - def text(self) -> str: - return self.css("h1::text").get() + @field + def text(self) -> Optional[str]: + return self.css("h1::text").get() - @attrs.define - class H2Page(WebPage[Header]): + class H2Page(WebPage[Header]): - @field - def text(self) -> str: - return self.css("h2::text").get() + @field + def text(self) -> Optional[str]: + return self.css("h2::text").get() - @handle_urls("example.com") - @attrs.define - class HeaderMultiLayoutPage(MultiLayoutPage[Header]): - response: HttpResponse - h1: H1Page - h2: H2Page + @handle_urls("example.com") + @attrs.define + class HeaderMultiLayoutPage(MultiLayoutPage[Header]): + response: HttpResponse + h1: H1Page + h2: H2Page - async def layout(self) -> ItemPage[Header]: - if self.response.css("h1::text"): - return self.h1 - return self.h2 + async def layout(self) -> ItemPage[Header]: + if self.response.css("h1::text"): + return self.h1 + return self.h2 .. note:: If you use :func:`~web_poet.handle_urls` both for your multi-layout page object class and for any of the page object classes that it diff --git a/tests/test_pages.py b/tests/test_pages.py index aec1bc55..76214ef8 100644 --- a/tests/test_pages.py +++ b/tests/test_pages.py @@ -40,13 +40,11 @@ async def test_multi_layout_page_object(): class Header: text: str - @attrs.define class H1Page(WebPage[Header]): @field def text(self) -> Optional[str]: return self.css("h1::text").get() - @attrs.define class H2Page(WebPage[Header]): @field def text(self) -> Optional[str]: diff --git a/web_poet/pages.py b/web_poet/pages.py index 21ff0e48..86f58978 100644 --- a/web_poet/pages.py +++ b/web_poet/pages.py @@ -77,7 +77,7 @@ class MultiLayoutPage(ItemPage[ItemT]): @abc.abstractmethod async def layout(self) -> ItemPage[ItemT]: """Return the :ref:`page object ` to use based on the - received input.""" + received input (e.g. :class:`~.HttpResponse`).""" async def to_item(self) -> ItemT: """Return the output of the :meth:`~web_poet.pages.ItemPage.to_item` From 51bf31f501eb75a2a3f1c45fdc4f6aa13f4b22e5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adri=C3=A1n=20Chaves?= Date: Tue, 22 Nov 2022 08:47:21 +0100 Subject: [PATCH 11/13] Add a test case for multilayout exposing layouts that inherit from a common, partial layout --- tests/test_pages.py | 96 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 96 insertions(+) diff --git a/tests/test_pages.py b/tests/test_pages.py index 76214ef8..45e3a31d 100644 --- a/tests/test_pages.py +++ b/tests/test_pages.py @@ -98,6 +98,102 @@ async def layout(self) -> ItemPage[Header]: assert item2.text == "b" +@pytest.mark.asyncio +async def test_multi_layout_page_object_shared_partial_layout(): + """Scenario where a multi-layout page object acts as a switch for 2 or + more layout page objects that all inherit from some other page object class + that implements extraction for shared fields.""" + + @attrs.define + class PartialItem: + url: str + + @attrs.define + class FullItem(PartialItem): + text: str + + class PartialPage(WebPage[PartialItem]): + @field + async def url(self) -> str: + return str(self.response.url) + + class FullPage1(PartialPage, Returns[FullItem]): + @field + async def text(self) -> Optional[str]: + return self.css("h1::text").get() + + class FullPage2(PartialPage, Returns[FullItem]): + @field + async def text(self) -> Optional[str]: + return self.css("h2::text").get() + + @attrs.define + class MyMultiLayoutPage(MultiLayoutPage[FullItem]): + response: HttpResponse + page1: FullPage1 + page2: FullPage2 + + async def layout(self) -> ItemPage[FullItem]: + if self.response.css("h1::text"): + return self.page1 # type: ignore[return-value] + return self.page2 # type: ignore[return-value] + + html1 = b""" + + + + h1 + + +

a

+ + + """ + html2 = b""" + + + + h2 + + +

b

+ + + """ + + url = "https://example.com" + response1 = HttpResponse(url, body=html1) + page1_1 = FullPage1(response=response1) + page2_1 = FullPage2(response=response1) + response2 = HttpResponse(url, body=html2) + page1_2 = FullPage1(response=response2) + page2_2 = FullPage2(response=response2) + + multilayoutpage1 = MyMultiLayoutPage( + response=response1, page1=page1_1, page2=page2_1 + ) + multilayoutpage2 = MyMultiLayoutPage( + response=response2, page1=page1_2, page2=page2_2 + ) + + # To access page object fields, you must first get the underlying page + # object, and then access its fields: + layout1 = await multilayoutpage1.layout() + assert await layout1.url == url + assert await layout1.text == "a" + layout2 = await multilayoutpage2.layout() + assert await layout2.url == url + assert await layout2.text == "b" + + # Returned items work as expected. + item1 = await multilayoutpage1.to_item() + assert item1.url == url + assert item1.text == "a" + item2 = await multilayoutpage2.to_item() + assert item2.url == url + assert item2.text == "b" + + def test_web_page_object(book_list_html_response) -> None: class MyWebPage(WebPage): def to_item(self) -> dict: # type: ignore From 828a84b0a4f5ec837b59703a45bb03a65b3f8a0c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adri=C3=A1n=20Chaves?= Date: Tue, 22 Nov 2022 09:03:33 +0100 Subject: [PATCH 12/13] =?UTF-8?q?MultiLayoutPage:=20layout=20=E2=86=92=20g?= =?UTF-8?q?et=5Flayout?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- docs/api-reference.rst | 2 +- docs/page-objects/layouts.rst | 6 +++--- tests/test_pages.py | 8 ++++---- web_poet/pages.py | 4 ++-- 4 files changed, 10 insertions(+), 10 deletions(-) diff --git a/docs/api-reference.rst b/docs/api-reference.rst index 9b928ea0..27d93fce 100644 --- a/docs/api-reference.rst +++ b/docs/api-reference.rst @@ -57,7 +57,7 @@ Pages .. autoclass:: MultiLayoutPage :show-inheritance: - :members: layout + :members: get_layout Mixins ====== diff --git a/docs/page-objects/layouts.rst b/docs/page-objects/layouts.rst index a6200384..48711207 100644 --- a/docs/page-objects/layouts.rst +++ b/docs/page-objects/layouts.rst @@ -185,8 +185,8 @@ Your multi-layout page object class should: You may use :class:`~web_poet.pages.MultiLayoutPage` as a base class for your multi-layout page object class, so you only need to implement the -:class:`~web_poet.pages.MultiLayoutPage.layout` method that determines which -page object to use. For example: +:class:`~web_poet.pages.MultiLayoutPage.get_layout` method that determines +which page object to use. For example: .. code-block:: python @@ -222,7 +222,7 @@ page object to use. For example: h1: H1Page h2: H2Page - async def layout(self) -> ItemPage[Header]: + async def get_layout(self) -> ItemPage[Header]: if self.response.css("h1::text"): return self.h1 return self.h2 diff --git a/tests/test_pages.py b/tests/test_pages.py index 45e3a31d..39e21752 100644 --- a/tests/test_pages.py +++ b/tests/test_pages.py @@ -56,7 +56,7 @@ class HeaderMultiLayoutPage(MultiLayoutPage[Header]): h1: H1Page h2: H2Page - async def layout(self) -> ItemPage[Header]: + async def get_layout(self) -> ItemPage[Header]: if self.response.css("h1::text"): return self.h1 return self.h2 @@ -133,7 +133,7 @@ class MyMultiLayoutPage(MultiLayoutPage[FullItem]): page1: FullPage1 page2: FullPage2 - async def layout(self) -> ItemPage[FullItem]: + async def get_layout(self) -> ItemPage[FullItem]: if self.response.css("h1::text"): return self.page1 # type: ignore[return-value] return self.page2 # type: ignore[return-value] @@ -178,10 +178,10 @@ async def layout(self) -> ItemPage[FullItem]: # To access page object fields, you must first get the underlying page # object, and then access its fields: - layout1 = await multilayoutpage1.layout() + layout1 = await multilayoutpage1.get_layout() assert await layout1.url == url assert await layout1.text == "a" - layout2 = await multilayoutpage2.layout() + layout2 = await multilayoutpage2.get_layout() assert await layout2.url == url assert await layout2.text == "b" diff --git a/web_poet/pages.py b/web_poet/pages.py index 86f58978..3687a469 100644 --- a/web_poet/pages.py +++ b/web_poet/pages.py @@ -75,7 +75,7 @@ class MultiLayoutPage(ItemPage[ItemT]): """ @abc.abstractmethod - async def layout(self) -> ItemPage[ItemT]: + async def get_layout(self) -> ItemPage[ItemT]: """Return the :ref:`page object ` to use based on the received input (e.g. :class:`~.HttpResponse`).""" @@ -83,7 +83,7 @@ async def to_item(self) -> ItemT: """Return the output of the :meth:`~web_poet.pages.ItemPage.to_item` method of the :ref:`page object ` that :meth:`layout` returns.""" - page_object = await self.layout() + page_object = await self.get_layout() return await page_object.to_item() From fc7867fb0d9a8c48cc43d3bf2ec295d09fbf724f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adri=C3=A1n=20Chaves?= Date: Thu, 26 Jan 2023 12:10:42 +0100 Subject: [PATCH 13/13] Provide __get_layout example as a test --- tests/test_multilayout.py | 116 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 116 insertions(+) create mode 100644 tests/test_multilayout.py diff --git a/tests/test_multilayout.py b/tests/test_multilayout.py new file mode 100644 index 00000000..782e822c --- /dev/null +++ b/tests/test_multilayout.py @@ -0,0 +1,116 @@ +"""Proof of concept of an approach to multi-layout support that involves +documenting best practices on how to handle it with the existing API, rather +than providing a new API for it.""" + +import attrs +import pytest + +from web_poet import HttpResponse, ItemPage, field + + +@attrs.define +class Item: + title: str + text: str + + +@attrs.define +class Title: + title: str + + +@attrs.define +class Text: + text: str + + +@pytest.mark.asyncio +async def test_multiple_inheritance(): + + html = b""" + + + + foo + + bar + + """ + + @attrs.define + class TitleAPage(ItemPage[Title]): + response: HttpResponse + + @field + def title(self): + return self.response.css("title::text").get() + + @attrs.define + class TitleBPage(ItemPage[Title]): + response: HttpResponse + + @field + def title(self): + return self.response.css("h1::text").get() + + @attrs.define + class TitleMultiLayout(ItemPage[Item]): + response: HttpResponse + title_a: TitleAPage + title_b: TitleBPage + + # TODO: cache the result + def __get_layout(self): + if self.response.css("#a"): + return self.title_a + return self.title_b + + @field + def title(self): + return self.__get_layout().title + + @attrs.define + class TextAPage(ItemPage[Text]): + response: HttpResponse + + @field + def text(self): + return self.response.css("#a::text").get() + + @attrs.define + class TextBPage(ItemPage[Text]): + response: HttpResponse + + @field + def text(self): + return self.response.css("#b::text").get() + + @attrs.define + class TitleAndTextMultiLayout(TitleMultiLayout): + text_a: TextAPage + text_b: TextBPage + + # TODO: cache the result + def __get_layout(self): + if self.response.css("#a"): + return self.text_a + return self.text_b + + @field + def text(self): + return self.__get_layout().text + + response = HttpResponse("https://example.com", body=html, encoding="utf8") + title_a = TitleAPage(response=response) + title_b = TitleBPage(response=response) + text_a = TextAPage(response=response) + text_b = TextBPage(response=response) + layout = TitleAndTextMultiLayout( + response=response, + title_a=title_a, + title_b=title_b, + text_a=text_a, + text_b=text_b, + ) + + assert await layout.to_item() == Item(title="foo", text="bar")