
Science category: update the engines

* use the paper.html template
* fetch more data from the engines
* add crossref.py
Alexandre FLAMENT 2 years ago
commit
e36f85b836
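
All updated engines now return results through the `paper.html` template. As a rough sketch of what that means in practice, a result dict carries structured bibliographic fields instead of a flattened content string. The snippet below is illustrative only; the field names are taken from the diffs that follow, and the sample values are invented:

```python
# Illustrative result dict an updated engine appends (values are made up).
from datetime import datetime

example_result = {
    'template': 'paper.html',
    'url': 'https://arxiv.org/abs/2101.00001',
    'title': 'An example preprint',
    'content': 'Abstract of the paper ...',
    'authors': ['A. Author', 'B. Author'],
    'doi': '10.48550/arXiv.2101.00001',
    'journal': None,
    'tags': ['cs.IR'],
    'comments': None,
    'pdf_url': 'https://arxiv.org/pdf/2101.00001',
    'publishedDate': datetime(2021, 1, 1),
}
```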

+ 65 - 27
searx/engines/arxiv.py

@@ -3,9 +3,10 @@
  ArXiV (Scientific preprints)
 """
 
-from lxml import html
+from lxml import etree
+from lxml.etree import XPath
 from datetime import datetime
-from searx.utils import eval_xpath_list, eval_xpath_getindex
+from searx.utils import eval_xpath, eval_xpath_list, eval_xpath_getindex
 
 # about
 about = {
@@ -17,7 +18,7 @@ about = {
     "results": 'XML-RSS',
 }
 
-categories = ['science']
+categories = ['science', 'scientific publications']
 paging = True
 
 base_url = (
@@ -27,6 +28,23 @@ base_url = (
 # engine dependent config
 number_of_results = 10
 
+# xpaths
+arxiv_namespaces = {
+    "atom": "http://www.w3.org/2005/Atom",
+    "arxiv": "http://arxiv.org/schemas/atom",
+}
+xpath_entry = XPath('//atom:entry', namespaces=arxiv_namespaces)
+xpath_title = XPath('.//atom:title', namespaces=arxiv_namespaces)
+xpath_id = XPath('.//atom:id', namespaces=arxiv_namespaces)
+xpath_summary = XPath('.//atom:summary', namespaces=arxiv_namespaces)
+xpath_author_name = XPath('.//atom:author/atom:name', namespaces=arxiv_namespaces)
+xpath_doi = XPath('.//arxiv:doi', namespaces=arxiv_namespaces)
+xpath_pdf = XPath('.//atom:link[@title="pdf"]', namespaces=arxiv_namespaces)
+xpath_published = XPath('.//atom:published', namespaces=arxiv_namespaces)
+xpath_journal = XPath('.//arxiv:journal_ref', namespaces=arxiv_namespaces)
+xpath_category = XPath('.//atom:category/@term', namespaces=arxiv_namespaces)
+xpath_comment = XPath('./arxiv:comment', namespaces=arxiv_namespaces)
+
 
 def request(query, params):
     # basic search
@@ -41,30 +59,50 @@ def request(query, params):
 
 def response(resp):
     results = []
-
-    dom = html.fromstring(resp.content)
-
-    for entry in eval_xpath_list(dom, '//entry'):
-        title = eval_xpath_getindex(entry, './/title', 0).text
-
-        url = eval_xpath_getindex(entry, './/id', 0).text
-
-        content_string = '{doi_content}{abstract_content}'
-
-        abstract = eval_xpath_getindex(entry, './/summary', 0).text
-
-        #  If a doi is available, add it to the snipppet
-        doi_element = eval_xpath_getindex(entry, './/link[@title="doi"]', 0, default=None)
-        doi_content = doi_element.text if doi_element is not None else ''
-        content = content_string.format(doi_content=doi_content, abstract_content=abstract)
-
-        if len(content) > 300:
-            content = content[0:300] + "..."
-        # TODO: center snippet on query term
-
-        publishedDate = datetime.strptime(eval_xpath_getindex(entry, './/published', 0).text, '%Y-%m-%dT%H:%M:%SZ')
-
-        res_dict = {'url': url, 'title': title, 'publishedDate': publishedDate, 'content': content}
+    dom = etree.fromstring(resp.content)
+    for entry in eval_xpath_list(dom, xpath_entry):
+        title = eval_xpath_getindex(entry, xpath_title, 0).text
+
+        url = eval_xpath_getindex(entry, xpath_id, 0).text
+        abstract = eval_xpath_getindex(entry, xpath_summary, 0).text
+
+        authors = [author.text for author in eval_xpath_list(entry, xpath_author_name)]
+
+        #  doi
+        doi_element = eval_xpath_getindex(entry, xpath_doi, 0, default=None)
+        doi = None if doi_element is None else doi_element.text
+
+        # pdf
+        pdf_element = eval_xpath_getindex(entry, xpath_pdf, 0, default=None)
+        pdf_url = None if pdf_element is None else pdf_element.attrib.get('href')
+
+        # journal
+        journal_element = eval_xpath_getindex(entry, xpath_journal, 0, default=None)
+        journal = None if journal_element is None else journal_element.text
+
+        # tags
+        tag_elements = eval_xpath(entry, xpath_category)
+        tags = [str(tag) for tag in tag_elements]
+
+        # comments
+        comments_elements = eval_xpath_getindex(entry, xpath_comment, 0, default=None)
+        comments = None if comments_elements is None else comments_elements.text
+
+        publishedDate = datetime.strptime(eval_xpath_getindex(entry, xpath_published, 0).text, '%Y-%m-%dT%H:%M:%SZ')
+
+        res_dict = {
+            'template': 'paper.html',
+            'url': url,
+            'title': title,
+            'publishedDate': publishedDate,
+            'content': abstract,
+            'doi': doi,
+            'authors': authors,
+            'journal': journal,
+            'tags': tags,
+            'comments': comments,
+            'pdf_url': pdf_url,
+        }
 
 
         results.append(res_dict)
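
For anyone who wants to poke at the Atom feed outside SearXNG, here is a minimal standalone sketch of how the namespaced `XPath` objects above resolve against an arXiv API response. The query URL is assumed from the public arXiv API and is not shown in this hunk:

```python
# Standalone sketch (not part of the engine): parse a small arXiv Atom feed
# with the same namespace map and XPath objects as the diff above.
import urllib.request
from lxml import etree
from lxml.etree import XPath

ns = {
    "atom": "http://www.w3.org/2005/Atom",
    "arxiv": "http://arxiv.org/schemas/atom",
}
xpath_entry = XPath('//atom:entry', namespaces=ns)
xpath_title = XPath('.//atom:title', namespaces=ns)

# Assumed public endpoint; the engine's base_url is context not shown here.
url = 'http://export.arxiv.org/api/query?search_query=all:electron&max_results=2'
with urllib.request.urlopen(url) as response:
    dom = etree.fromstring(response.read())

for entry in xpath_entry(dom):
    print(xpath_title(entry)[0].text)
```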
 
 

+ 59 - 0
searx/engines/crossref.py

@@ -0,0 +1,59 @@
+# SPDX-License-Identifier: AGPL-3.0-or-later
+# lint: pylint
+"""CrossRef (Science)
+"""
+
+from urllib.parse import urlencode
+from searx.utils import html_to_text
+
+about = {
+    "website": 'https://www.crossref.org/',
+    "wikidata_id": 'Q5188229',
+    "official_api_documentation": 'https://github.com/CrossRef/rest-api-doc',
+    "use_official_api": False,
+    "require_api_key": False,
+    "results": 'JSON',
+}
+
+categories = ['science', 'scientific publications']
+paging = True
+search_url = 'https://api.crossref.org/works'
+
+
+def request(query, params):
+    params['url'] = search_url + '?' + urlencode(dict(query=query, offset=20 * (params['pageno'] - 1)))
+    return params
+
+
+def response(resp):
+    res = resp.json()
+    results = []
+    for record in res['message']['items']:
+        record_type = record['type']
+        if record_type == 'book-chapter':
+            title = record['container-title'][0]
+            if record['title'][0].lower().strip() != title.lower().strip():
+                title = title + ' (' + record['title'][0] + ')'
+            journal = None
+        else:
+            title = record['title'][0]
+            journal = record.get('container-title', [None])[0]
+        url = record.get('resource', {}).get('primary', {}).get('URL') or record['URL']
+        authors = [author.get('given', '') + ' ' + author.get('family', '') for author in record.get('author', [])]
+        isbn = record.get('isbn') or [i['value'] for i in record.get('isbn-type', [])]
+        results.append(
+            {
+                'template': 'paper.html',
+                'url': url,
+                'title': title,
+                'journal': journal,
+                'volume': record.get('volume'),
+                'type': record['type'],
+                'content': html_to_text(record.get('abstract', '')),
+                'publisher': record.get('publisher'),
+                'authors': authors,
+                'doi': record['DOI'],
+                'isbn': isbn,
+            }
+        )
+    return results
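
A quick way to sanity-check the new engine against the live endpoint is a standalone query built the same way `request()` builds its URL. This sketch uses `requests` (which the engine itself does not); the field names match the code above:

```python
# Standalone check of the Crossref works endpoint and the fields response() reads.
import requests

res = requests.get(
    'https://api.crossref.org/works',
    params={'query': 'searx', 'offset': 0},
    timeout=10,
).json()
for record in res['message']['items'][:5]:
    title = (record.get('title') or ['(untitled)'])[0]
    journal = (record.get('container-title') or [None])[0]
    print(record['DOI'], '-', title, '-', journal)
```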

+ 73 - 12
searx/engines/google_scholar.py

@@ -13,10 +13,12 @@ Definitions`_.
 
 from urllib.parse import urlencode
 from datetime import datetime
+from typing import Optional
 from lxml import html
 
 from searx.utils import (
     eval_xpath,
+    eval_xpath_getindex,
     eval_xpath_list,
     extract_text,
 )
@@ -46,7 +48,7 @@ about = {
 }
 
 # engine dependent config
-categories = ['science']
+categories = ['science', 'scientific publications']
 paging = True
 language_support = True
 use_locale_domain = True
@@ -99,7 +101,43 @@ def request(query, params):
     return params
 
 
-def response(resp):
+def parse_gs_a(text: Optional[str]):
+    """Parse the text written in green.
+
+    Possible formats:
+    * "{authors} - {journal}, {year} - {publisher}"
+    * "{authors} - {year} - {publisher}"
+    * "{authors} - {publisher}"
+    """
+    if text is None or text == "":
+        return None, None, None, None
+
+    s_text = text.split(' - ')
+    authors = s_text[0].split(', ')
+    publisher = s_text[-1]
+    if len(s_text) != 3:
+        return authors, None, publisher, None
+
+    # the format is "{authors} - {journal}, {year} - {publisher}" or "{authors} - {year} - {publisher}"
+    # get journal and year
+    journal_year = s_text[1].split(', ')
+    # journal is optional and may contains some coma
+    if len(journal_year) > 1:
+        journal = ', '.join(journal_year[0:-1])
+        if journal == '…':
+            journal = None
+    else:
+        journal = None
+    # year
+    year = journal_year[-1]
+    try:
+        publishedDate = datetime.strptime(year.strip(), '%Y')
+    except ValueError:
+        publishedDate = None
+    return authors, journal, publisher, publishedDate
+
+
+def response(resp):  # pylint: disable=too-many-locals
     """Get response from google's search request"""
     results = []
 
@@ -112,30 +150,53 @@ def response(resp):
     dom = html.fromstring(resp.text)
 
     # parse results
-    for result in eval_xpath_list(dom, '//div[@class="gs_ri"]'):
+    for result in eval_xpath_list(dom, '//div[@data-cid]'):
 
-        title = extract_text(eval_xpath(result, './h3[1]//a'))
+        title = extract_text(eval_xpath(result, './/h3[1]//a'))
 
         if not title:
             # this is a [ZITATION] block
             continue
 
-        url = eval_xpath(result, './h3[1]//a/@href')[0]
-        content = extract_text(eval_xpath(result, './div[@class="gs_rs"]')) or ''
-
-        pub_info = extract_text(eval_xpath(result, './div[@class="gs_a"]'))
-        if pub_info:
-            content += "[%s]" % pub_info
-
         pub_type = extract_text(eval_xpath(result, './/span[@class="gs_ct1"]'))
         if pub_type:
-            title = title + " " + pub_type
+            pub_type = pub_type[1:-1].lower()
+
+        url = eval_xpath_getindex(result, './/h3[1]//a/@href', 0)
+        content = extract_text(eval_xpath(result, './/div[@class="gs_rs"]'))
+        authors, journal, publisher, publishedDate = parse_gs_a(
+            extract_text(eval_xpath(result, './/div[@class="gs_a"]'))
+        )
+        if publisher in url:
+            publisher = None
+
+        # cited by
+        comments = extract_text(eval_xpath(result, './/div[@class="gs_fl"]/a[starts-with(@href,"/scholar?cites=")]'))
+
+        # link to the html or pdf document
+        html_url = None
+        pdf_url = None
+        doc_url = eval_xpath_getindex(result, './/div[@class="gs_or_ggsm"]/a/@href', 0, default=None)
+        doc_type = extract_text(eval_xpath(result, './/span[@class="gs_ctg2"]'))
+        if doc_type == "[PDF]":
+            pdf_url = doc_url
+        else:
+            html_url = doc_url
 
         results.append(
             {
+                'template': 'paper.html',
+                'type': pub_type,
                 'url': url,
                 'title': title,
+                'authors': authors,
+                'publisher': publisher,
+                'journal': journal,
+                'publishedDate': publishedDate,
                 'content': content,
+                'comments': comments,
+                'html_url': html_url,
+                'pdf_url': pdf_url,
             }
         )
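
The docstring formats of `parse_gs_a()` translate into return values like these. The sample strings are made up, not scraped from a live Scholar page, and the sketch assumes `parse_gs_a` from the hunk above is in scope:

```python
# parse_gs_a() on the three formats listed in its docstring.
print(parse_gs_a('J Doe, A Smith - Nature, 2019 - nature.com'))
# (['J Doe', 'A Smith'], 'Nature', 'nature.com', datetime(2019, 1, 1))

print(parse_gs_a('J Doe - 2021 - example.org'))
# (['J Doe'], None, 'example.org', datetime(2021, 1, 1))

print(parse_gs_a('J Doe - example.org'))
# (['J Doe'], None, 'example.org', None)
```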
 
 

+ 59 - 40
searx/engines/pubmed.py

@@ -3,11 +3,15 @@
  PubMed (Scholar publications)
 """
 
-from flask_babel import gettext
 from lxml import etree
 from datetime import datetime
 from urllib.parse import urlencode
 from searx.network import get
+from searx.utils import (
+    eval_xpath_getindex,
+    eval_xpath_list,
+    extract_text,
+)
 
 # about
 about = {
@@ -22,7 +26,7 @@ about = {
     "results": 'XML',
 }
 
-categories = ['science']
+categories = ['science', 'scientific publications']
 
 base_url = (
     'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi' + '?db=pubmed&{query}&retstart={offset}&retmax={hits}'
@@ -63,46 +67,61 @@ def response(resp):
 
     retrieve_url_encoded = pubmed_retrieve_api_url.format(**retrieve_notice_args)
 
-    search_results_xml = get(retrieve_url_encoded).content
-    search_results = etree.XML(search_results_xml).xpath('//PubmedArticleSet/PubmedArticle/MedlineCitation')
-
-    for entry in search_results:
-        title = entry.xpath('.//Article/ArticleTitle')[0].text
+    search_results_response = get(retrieve_url_encoded).content
+    search_results = etree.XML(search_results_response)
+    for entry in eval_xpath_list(search_results, '//PubmedArticle'):
+        medline = eval_xpath_getindex(entry, './MedlineCitation', 0)
 
-        pmid = entry.xpath('.//PMID')[0].text
+        title = eval_xpath_getindex(medline, './/Article/ArticleTitle', 0).text
+        pmid = eval_xpath_getindex(medline, './/PMID', 0).text
         url = pubmed_url + pmid
-
-        try:
-            content = entry.xpath('.//Abstract/AbstractText')[0].text
-        except:
-            content = gettext('No abstract is available for this publication.')
-
-        #  If a doi is available, add it to the snipppet
-        try:
-            doi = entry.xpath('.//ELocationID[@EIdType="doi"]')[0].text
-            content = 'DOI: {doi} Abstract: {content}'.format(doi=doi, content=content)
-        except:
-            pass
-
-        if len(content) > 300:
-            content = content[0:300] + "..."
-        # TODO: center snippet on query term
-
-        res_dict = {'url': url, 'title': title, 'content': content}
-
-        try:
-            publishedDate = datetime.strptime(
-                entry.xpath('.//DateCreated/Year')[0].text
-                + '-'
-                + entry.xpath('.//DateCreated/Month')[0].text
-                + '-'
-                + entry.xpath('.//DateCreated/Day')[0].text,
-                '%Y-%m-%d',
-            )
-            res_dict['publishedDate'] = publishedDate
-        except:
-            pass
+        content = extract_text(
+            eval_xpath_getindex(medline, './/Abstract/AbstractText//text()', 0, default=None), allow_none=True
+        )
+        doi = extract_text(
+            eval_xpath_getindex(medline, './/ELocationID[@EIdType="doi"]/text()', 0, default=None), allow_none=True
+        )
+        journal = extract_text(
+            eval_xpath_getindex(medline, './Article/Journal/Title/text()', 0, default=None), allow_none=True
+        )
+        issn = extract_text(
+            eval_xpath_getindex(medline, './Article/Journal/ISSN/text()', 0, default=None), allow_none=True
+        )
+        authors = []
+        for author in eval_xpath_list(medline, './Article/AuthorList/Author'):
+            f = eval_xpath_getindex(author, './ForeName', 0, default=None)
+            l = eval_xpath_getindex(author, './LastName', 0, default=None)
+            f = '' if f is None else f.text
+            l = '' if l is None else l.text
+            authors.append((f + ' ' + l).strip())
+
+        res_dict = {
+            'template': 'paper.html',
+            'url': url,
+            'title': title,
+            'content': content,
+            'journal': journal,
+            'issn': [issn],
+            'authors': authors,
+            'doi': doi,
+        }
+
+        accepted_date = eval_xpath_getindex(
+            entry, './PubmedData/History//PubMedPubDate[@PubStatus="accepted"]', 0, default=None
+        )
+        if accepted_date is not None:
+            year = eval_xpath_getindex(accepted_date, './Year', 0)
+            month = eval_xpath_getindex(accepted_date, './Month', 0)
+            day = eval_xpath_getindex(accepted_date, './Day', 0)
+            try:
+                publishedDate = datetime.strptime(
+                    year.text + '-' + month.text + '-' + day.text,
+                    '%Y-%m-%d',
+                )
+                res_dict['publishedDate'] = publishedDate
+            except Exception as e:
+                print(e)
 
 
         results.append(res_dict)
 
 
-        return results
+    return results
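
To see what the new XPath-based extraction does without calling the E-utilities, here is a small offline sketch against an abridged record. The XML structure is inferred from the XPaths above, not copied from a real efetch response:

```python
# Offline sketch of the author extraction added above, on an abridged record.
from lxml import etree

xml = b"""
<PubmedArticleSet>
  <PubmedArticle>
    <MedlineCitation>
      <PMID>12345</PMID>
      <Article>
        <ArticleTitle>Example title</ArticleTitle>
        <AuthorList>
          <Author><ForeName>Ada</ForeName><LastName>Lovelace</LastName></Author>
          <Author><LastName>Collective</LastName></Author>
        </AuthorList>
      </Article>
    </MedlineCitation>
  </PubmedArticle>
</PubmedArticleSet>
"""
root = etree.XML(xml)
for medline in root.xpath('//PubmedArticle/MedlineCitation'):
    authors = []
    for author in medline.xpath('./Article/AuthorList/Author'):
        fore = author.findtext('./ForeName') or ''
        last = author.findtext('./LastName') or ''
        authors.append((fore + ' ' + last).strip())
    print(medline.findtext('.//PMID'), authors)  # 12345 ['Ada Lovelace', 'Collective']
```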

+ 41 - 16
searx/engines/semantic_scholar.py

@@ -6,6 +6,8 @@
 from json import dumps, loads
 from datetime import datetime
 
+from flask_babel import gettext
+
 about = {
     "website": 'https://www.semanticscholar.org/',
     "wikidata_id": 'Q22908627',
@@ -15,6 +17,7 @@ about = {
     "results": 'JSON',
 }
 
+categories = ['science', 'scientific publications']
 paging = True
 search_url = 'https://www.semanticscholar.org/api/1/search'
 paper_url = 'https://www.semanticscholar.org/paper'
@@ -47,9 +50,6 @@ def response(resp):
     results = []
 
     for result in res['results']:
-        item = {}
-        metadata = []
-
         url = result.get('primaryPaperLink', {}).get('url')
         if not url and result.get('links'):
             url = result.get('links')[0]
@@ -60,22 +60,47 @@ def response(resp):
         if not url:
             url = paper_url + '/%s' % result['id']
 
-        item['url'] = url
+        # publishedDate
+        if 'pubDate' in result:
+            publishedDate = datetime.strptime(result['pubDate'], "%Y-%m-%d")
+        else:
+            publishedDate = None
 
-        item['title'] = result['title']['text']
-        item['content'] = result['paperAbstract']['text']
+        # authors
+        authors = [author[0]['name'] for author in result.get('authors', [])]
 
-        metadata = result.get('fieldsOfStudy') or []
-        venue = result.get('venue', {}).get('text')
-        if venue:
-            metadata.append(venue)
-        if metadata:
-            item['metadata'] = ', '.join(metadata)
+        # pick the first alternate link that is not from the crawler
+        pdf_url = None
+        for doc in result.get('alternatePaperLinks', []):
+            if doc['linkType'] != 'crawler':
+                pdf_url = doc['url']
+                break
 
-        pubDate = result.get('pubDate')
-        if pubDate:
-            item['publishedDate'] = datetime.strptime(pubDate, "%Y-%m-%d")
+        # comments
+        comments = None
+        if 'citationStats' in result:
+            comments = gettext(
+                '{numCitations} citations from the year {firstCitationVelocityYear} to {lastCitationVelocityYear}'
+            ).format(
+                numCitations=result['citationStats']['numCitations'],
+                firstCitationVelocityYear=result['citationStats']['firstCitationVelocityYear'],
+                lastCitationVelocityYear=result['citationStats']['lastCitationVelocityYear'],
+            )
 
 
-        results.append(item)
+        results.append(
+            {
+                'template': 'paper.html',
+                'url': url,
+                'title': result['title']['text'],
+                'content': result['paperAbstract']['text'],
+                'journal': result.get('venue', {}).get('text') or result.get('journal', {}).get('name'),
+                'doi': result.get('doiInfo', {}).get('doi'),
+                'tags': result.get('fieldsOfStudy'),
+                'authors': authors,
+                'pdf_url': pdf_url,
+                'publishedDate': publishedDate,
+                'comments': comments,
+            }
+        )
 
     return results
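
Two of the new fields in plainer terms, on an invented `result` fragment (the `gettext` wrapper is dropped here): `pdf_url` takes the first alternate link whose `linkType` is not `crawler`, and `comments` becomes a citation summary built from `citationStats`:

```python
# Invented fragment shaped like one entry of res['results'] above.
result = {
    'alternatePaperLinks': [
        {'linkType': 'crawler', 'url': 'https://example.org/cache.pdf'},
        {'linkType': 'open-access', 'url': 'https://example.org/paper.pdf'},
    ],
    'citationStats': {
        'numCitations': 42,
        'firstCitationVelocityYear': 2018,
        'lastCitationVelocityYear': 2022,
    },
}

pdf_url = None
for doc in result.get('alternatePaperLinks', []):
    if doc['linkType'] != 'crawler':
        pdf_url = doc['url']
        break
print(pdf_url)  # https://example.org/paper.pdf

comments = (
    '{numCitations} citations from the year '
    '{firstCitationVelocityYear} to {lastCitationVelocityYear}'
).format(**result['citationStats'])
print(comments)  # 42 citations from the year 2018 to 2022
```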

+ 18 - 20
searx/engines/springer.py

@@ -19,7 +19,7 @@ about = {
     "results": 'JSON',
 }
 
-categories = ['science']
+categories = ['science', 'scientific publications']
 paging = True
 nb_per_page = 10
 api_key = 'unset'
@@ -41,32 +41,30 @@ def response(resp):
     json_data = loads(resp.text)
 
     for record in json_data['records']:
-        content = record['abstract'][0:500]
-        if len(record['abstract']) > len(content):
-            content += "..."
+        content = record['abstract']
         published = datetime.strptime(record['publicationDate'], '%Y-%m-%d')
-
-        metadata = [
-            record[x]
-            for x in [
-                'publicationName',
-                'identifier',
-                'contentType',
-            ]
-            if record.get(x) is not None
-        ]
-
-        metadata = ' / '.join(metadata)
-        if record.get('startingPage') and record.get('endingPage') is not None:
-            metadata += " (%(startingPage)s-%(endingPage)s)" % record
-
+        authors = [" ".join(author['creator'].split(', ')[::-1]) for author in record['creators']]
+        tags = record.get('genre')
+        if isinstance(tags, str):
+            tags = [tags]
         results.append(
             {
+                'template': 'paper.html',
                 'title': record['title'],
                 'url': record['url'][0]['value'].replace('http://', 'https://', 1),
+                'type': record.get('contentType'),
                 'content': content,
                 'publishedDate': published,
-                'metadata': metadata,
+                'authors': authors,
+                'doi': record.get('doi'),
+                'journal': record.get('publicationName'),
+                'start_page': record.get('startingPage'),
+                'end_page': record.get('endingPage'),
+                'tags': tags,
+                'issn': [record.get('issn')],
+                'isbn': [record.get('isbn')],
+                'volume': record.get('volume') or None,
+                'number': record.get('number') or None,
             }
         )
     return results
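
The one-liner building `authors` reverses Springer's "Family, Given" creator strings. A tiny standalone check (the sample creators are made up):

```python
# "Family, Given" -> "Given Family", as in the list comprehension above.
creators = [{'creator': 'Curie, Marie'}, {'creator': 'Einstein, Albert'}]
authors = [" ".join(author['creator'].split(', ')[::-1]) for author in creators]
print(authors)  # ['Marie Curie', 'Albert Einstein']
```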

+ 1 - 0
searx/searxng.msg

@@ -43,6 +43,7 @@ CATEGORY_GROUPS = {
     'REPOS': 'repos',
     'SOFTWARE_WIKIS': 'software wikis',
     'WEB': 'web',
+    'SCIENTIFIC PUBLICATIONS': 'scientific publications',
 }
 
 STYLE_NAMES = {

+ 4 - 22
searx/settings.yml

@@ -319,7 +319,6 @@ engines:
   - name: arxiv
     engine: arxiv
     shortcut: arx
-    categories: science
     timeout: 4.0
 
   # tmp suspended:  dh key too small
@@ -411,23 +410,9 @@ engines:
   #   api_key: 'unset'
 
   - name: crossref
-    engine: json_engine
-    paging: true
-    search_url: https://search.crossref.org/dois?q={query}&page={pageno}
-    url_query: doi
-    title_query: title
-    title_html_to_text: true
-    content_query: fullCitation
-    content_html_to_text: true
-    categories: science
+    engine: crossref
     shortcut: cr
-    about:
-      website: https://www.crossref.org/
-      wikidata_id: Q5188229
-      official_api_documentation: https://github.com/CrossRef/rest-api-doc
-      use_official_api: false
-      require_api_key: false
-      results: JSON
+    timeout: 10
 
   - name: yep
     engine: json_engine
@@ -1068,7 +1053,7 @@ engines:
     title_query: metadata/oaf:entity/oaf:result/title/$
     content_query: metadata/oaf:entity/oaf:result/description/$
     content_html_to_text: true
-    categories: science
+    categories: "science"
     shortcut: oad
     timeout: 5.0
     about:
@@ -1198,7 +1183,6 @@ engines:
   - name: pubmed
     engine: pubmed
     shortcut: pub
-    categories: science
     timeout: 3.0
 
   - name: pypi
@@ -1346,7 +1330,6 @@ engines:
     engine: semantic_scholar
     disabled: true
     shortcut: se
-    categories: science
 
   # Spotify needs API credentials
   # - name: spotify
@@ -1372,8 +1355,7 @@ engines:
   #   # working API key, for test & debug: "a69685087d07eca9f13db62f65b8f601"
   #   api_key: 'unset'
   #   shortcut: springer
-  #   categories: science
-  #   timeout: 6.0
+  #   timeout: 15.0
 
   - name: startpage
     engine: startpage