Browse Source

Merge pull request #1030 from austinhuang0131/master

(feat) add jisho.org
Alexandre Flament 3 years ago
parent
commit
592cea0e5e
2 changed files with 142 additions and 0 deletions
  1. 136 0
      searx/engines/jisho.py
  2. 6 0
      searx/settings.yml

+ 136 - 0
searx/engines/jisho.py

@@ -0,0 +1,136 @@
+# SPDX-License-Identifier: AGPL-3.0-or-later
+"""
+Jisho (the Japanese-English dictionary)
+"""
+
+from urllib.parse import urlencode, urljoin
+
+# about
+about = {
+    "website": 'https://jisho.org',
+    "wikidata_id": 'Q24568389',
+    "official_api_documentation": "https://jisho.org/forum/54fefc1f6e73340b1f160000-is-there-any-kind-of-search-api",
+    "use_official_api": True,
+    "require_api_key": False,
+    "results": 'JSON',
+    "language": 'ja',
+}
+
+categories = ['dictionaries']
+paging = False
+
+URL = 'https://jisho.org'
+BASE_URL = 'https://jisho.org/word/'
+SEARCH_URL = URL + '/api/v1/search/words?{query}'
+
+
+def request(query, params):
+    query = urlencode({'keyword': query})
+    params['url'] = SEARCH_URL.format(query=query)
+    logger.debug(f"query_url --> {params['url']}")
+    return params
+
+
+def response(resp):
+    results = []
+    first_result = True
+
+    search_results = resp.json()
+
+    for page in search_results.get('data', []):
+        # Entries that are purely from Wikipedia are excluded.
+        parts_of_speech = page.get('senses') and page['senses'][0].get('parts_of_speech')
+        if parts_of_speech and parts_of_speech[0] == 'Wikipedia definition':
+            pass
+
+        # Process alternative forms
+        alt_forms = []
+        for title_raw in page['japanese']:
+            if 'word' not in title_raw:
+                alt_forms.append(title_raw['reading'])
+            else:
+                title = title_raw['word']
+                if 'reading' in title_raw:
+                    title += ' (' + title_raw['reading'] + ')'
+                alt_forms.append(title)
+        
+        #
+        result_url = urljoin(BASE_URL, page['slug'])
+        definitions = get_definitions(page)
+
+        # For results, we'll return the URL, all alternative forms (as title),
+        # and all definitions (as description) truncated to 300 characters.
+        content = " ".join(f"{engdef}." for _, engdef, _ in definitions)
+        results.append({
+            'url': result_url,
+            'title': ", ".join(alt_forms),
+            'content': content[:300] + (content[300:] and '...')
+        })
+
+        # Like Wordnik, we'll return the first result in an infobox too.
+        if first_result:
+            first_result = False
+            results.append(get_infobox(alt_forms, result_url, definitions))
+
+    return results
+
+
+def get_definitions(page):
+    # Process definitions
+    definitions = []
+    for defn_raw in page['senses']:
+        extra = []
+        # Extra data. Since they're not documented, this implementation is based solely by the author's assumptions.
+        if defn_raw.get('tags'):
+            if defn_raw.get('info'):
+                # "usually written as kana: <kana>"
+                extra.append(defn_raw['tags'][0] + ', ' + defn_raw['info'][0] + '. ')
+            else:
+                # abbreviation, archaism, etc.
+                extra.append(', '.join(defn_raw['tags']) + '. ')
+        elif defn_raw.get('info'):
+            # inconsistent
+            extra.append(', '.join(defn_raw['info']).capitalize() + '. ')
+        if defn_raw.get('restrictions'):
+            extra.append('Only applies to: ' + ', '.join(defn_raw['restrictions']) + '. ')
+        definitions.append((
+            ', '.join(defn_raw['parts_of_speech']),
+            '; '.join(defn_raw['english_definitions']),
+            ''.join(extra)[:-1],
+        ))
+    return definitions
+
+
+def get_infobox(alt_forms, result_url, definitions):
+    infobox_content = []
+    # title & alt_forms
+    infobox_title = alt_forms[0]
+    if len(alt_forms) > 1:
+        infobox_content.append(f'<p><i>Other forms:</i> {", ".join(alt_forms[1:])}</p>')
+
+    # definitions
+    infobox_content.append('''
+        <small><a href="https://www.edrdg.org/wiki/index.php/JMdict-EDICT_Dictionary_Project">JMdict</a> 
+        and <a href="https://www.edrdg.org/enamdict/enamdict_doc.html">JMnedict</a> 
+        by <a href="https://www.edrdg.org/edrdg/licence.html">EDRDG</a>, CC BY-SA 3.0.</small>
+        <ul>
+    ''')
+    for pos, engdef, extra in definitions:
+        if pos == 'Wikipedia definition':
+            infobox_content.append('</ul><small>Wikipedia, CC BY-SA 3.0.</small><ul>')
+        pos = f'<i>{pos}</i>: ' if pos else ''
+        extra = f' ({extra})' if extra else ''
+        infobox_content.append(f'<li>{pos}{engdef}{extra}</li>')
+    infobox_content.append('</ul>')
+
+    #
+    return {
+        'infobox': infobox_title,
+        'content': ''.join(infobox_content),
+        'urls': [
+            {
+                'title': 'Jisho.org',
+                'url': result_url,
+            }
+        ]
+    }

+ 6 - 0
searx/settings.yml

@@ -798,6 +798,12 @@ engines:
     timeout: 3.0
     disabled: true
 
+  - name: jisho
+    engine: jisho
+    shortcut: js
+    timeout: 3.0
+    disabled: true
+
   - name: kickass
     engine: kickass
     shortcut: kc