Browse Source

[mod] utils.py: add markdown_to_text helper function

Bnyro 1 year ago
parent
commit
a3d7e9c285
2 changed files with 30 additions and 12 deletions
  1. 6 12
      searx/engines/lemmy.py
  2. 24 0
      searx/utils.py

+ 6 - 12
searx/engines/lemmy.py

@@ -42,10 +42,9 @@ Implementations
 from datetime import datetime
 from datetime import datetime
 from urllib.parse import urlencode
 from urllib.parse import urlencode
 
 
-from markdown_it import MarkdownIt
 from flask_babel import gettext
 from flask_babel import gettext
 
 
-from searx.utils import html_to_text
+from searx.utils import markdown_to_text
 
 
 about = {
 about = {
     "website": 'https://lemmy.ml/',
     "website": 'https://lemmy.ml/',
@@ -78,11 +77,6 @@ def request(query, params):
     return params
     return params
 
 
 
 
-def _format_content(content):
-    html = MarkdownIt("commonmark", {"typographer": True}).enable(["replacements", "smartquotes"]).render(content)
-    return html_to_text(html)
-
-
 def _get_communities(json):
 def _get_communities(json):
     results = []
     results = []
 
 
@@ -97,7 +91,7 @@ def _get_communities(json):
             {
             {
                 'url': result['community']['actor_id'],
                 'url': result['community']['actor_id'],
                 'title': result['community']['title'],
                 'title': result['community']['title'],
-                'content': _format_content(result['community'].get('description', '')),
+                'content': markdown_to_text(result['community'].get('description', '')),
                 'img_src': result['community'].get('icon', result['community'].get('banner')),
                 'img_src': result['community'].get('icon', result['community'].get('banner')),
                 'publishedDate': datetime.strptime(counts['published'][:19], '%Y-%m-%dT%H:%M:%S'),
                 'publishedDate': datetime.strptime(counts['published'][:19], '%Y-%m-%dT%H:%M:%S'),
                 'metadata': metadata,
                 'metadata': metadata,
@@ -114,7 +108,7 @@ def _get_users(json):
             {
             {
                 'url': result['person']['actor_id'],
                 'url': result['person']['actor_id'],
                 'title': result['person']['name'],
                 'title': result['person']['name'],
-                'content': _format_content(result['person'].get('bio', '')),
+                'content': markdown_to_text(result['person'].get('bio', '')),
             }
             }
         )
         )
 
 
@@ -140,7 +134,7 @@ def _get_posts(json):
 
 
         content = result['post'].get('body', '').strip()
         content = result['post'].get('body', '').strip()
         if content:
         if content:
-            content = _format_content(content)
+            content = markdown_to_text(content)
 
 
         results.append(
         results.append(
             {
             {
@@ -164,7 +158,7 @@ def _get_comments(json):
 
 
         content = result['comment'].get('content', '').strip()
         content = result['comment'].get('content', '').strip()
         if content:
         if content:
-            content = _format_content(content)
+            content = markdown_to_text(content)
 
 
         metadata = (
         metadata = (
             f"▲ {result['counts']['upvotes']} ▼ {result['counts']['downvotes']}"
             f"▲ {result['counts']['upvotes']} ▼ {result['counts']['downvotes']}"
@@ -176,7 +170,7 @@ def _get_comments(json):
             {
             {
                 'url': result['comment']['ap_id'],
                 'url': result['comment']['ap_id'],
                 'title': result['post']['name'],
                 'title': result['post']['name'],
-                'content': _format_content(result['comment']['content']),
+                'content': markdown_to_text(result['comment']['content']),
                 'publishedDate': datetime.strptime(result['comment']['published'][:19], '%Y-%m-%dT%H:%M:%S'),
                 'publishedDate': datetime.strptime(result['comment']['published'][:19], '%Y-%m-%dT%H:%M:%S'),
                 'metadata': metadata,
                 'metadata': metadata,
             }
             }

+ 24 - 0
searx/utils.py

@@ -15,6 +15,7 @@ from os.path import splitext, join
 from random import choice
 from random import choice
 from html.parser import HTMLParser
 from html.parser import HTMLParser
 from urllib.parse import urljoin, urlparse
 from urllib.parse import urljoin, urlparse
+from markdown_it import MarkdownIt
 
 
 from lxml import html
 from lxml import html
 from lxml.etree import ElementBase, XPath, XPathError, XPathSyntaxError, _ElementStringResult, _ElementUnicodeResult
 from lxml.etree import ElementBase, XPath, XPathError, XPathSyntaxError, _ElementStringResult, _ElementUnicodeResult
@@ -158,6 +159,29 @@ def html_to_text(html_str: str) -> str:
     return s.get_text()
     return s.get_text()
 
 
 
 
def markdown_to_text(markdown_str: str) -> str:
    """Render a Markdown string and return its plain-text content.

    The Markdown source is first rendered to HTML (CommonMark with the
    typographer extensions ``replacements`` and ``smartquotes`` enabled),
    then the markup is stripped via :py:func:`html_to_text`.

    Args:
        * markdown_str (str): the Markdown source

    Returns:
        * str: the extracted plain text

    Examples:
        >>> markdown_to_text('[example](https://example.com)')
        'example'

        >>> markdown_to_text('## Headline')
        'Headline'
    """

    # MarkdownIt.enable() mutates the parser in place and returns it,
    # so building it step by step is equivalent to the chained form.
    renderer = MarkdownIt("commonmark", {"typographer": True})
    renderer.enable(["replacements", "smartquotes"])
    rendered_html = renderer.render(markdown_str)
    return html_to_text(rendered_html)
+
+
 def extract_text(xpath_results, allow_none: bool = False) -> Optional[str]:
 def extract_text(xpath_results, allow_none: bool = False) -> Optional[str]:
     """Extract text from a lxml result
     """Extract text from a lxml result