|
@@ -142,9 +142,6 @@ search_url = base_url + '/sp/search'
|
|
|
# specific xpath variables
|
|
|
# ads xpath //div[@id="results"]/div[@id="sponsored"]//div[@class="result"]
|
|
|
# not ads: div[@class="result"] are the direct children of div[@id="results"]
|
|
|
-results_xpath = '//div[@class="w-gl__result__main"]'
|
|
|
-link_xpath = './/a[@class="w-gl__result-title result-link"]'
|
|
|
-content_xpath = './/p[@class="w-gl__description"]'
|
|
|
search_form_xpath = '//form[@id="search"]'
|
|
|
"""XPath of Startpage's origin search form
|
|
|
|
|
@@ -334,8 +331,8 @@ def _response_cat_web(dom):
|
|
|
results = []
|
|
|
|
|
|
# parse results
|
|
|
- for result in eval_xpath(dom, results_xpath):
|
|
|
- links = eval_xpath(result, link_xpath)
|
|
|
+ for result in eval_xpath(dom, '//div[@class="w-gl"]/div[contains(@class, "result")]'):
|
|
|
+ links = eval_xpath(result, './/a[contains(@class, "result-title result-link")]')
|
|
|
if not links:
|
|
|
continue
|
|
|
link = links[0]
|
|
@@ -349,12 +346,9 @@ def _response_cat_web(dom):
|
|
|
if re.match(r"^http(s|)://(www\.)?startpage\.com/do/search\?.*$", url):
|
|
|
continue
|
|
|
|
|
|
- title = extract_text(link)
|
|
|
-
|
|
|
- if eval_xpath(result, content_xpath):
|
|
|
- content: str = extract_text(eval_xpath(result, content_xpath)) # type: ignore
|
|
|
- else:
|
|
|
- content = ''
|
|
|
+ title = extract_text(eval_xpath(link, 'h2'))
|
|
|
+ content = eval_xpath(result, './/p[contains(@class, "description")]')
|
|
|
+ content = extract_text(content, allow_none=True) or ''
|
|
|
|
|
|
published_date = None
|
|
|
|