Browse Source

[mod] results.py: merge infobox URLs and attributes when they have the same label or the same entity

Entities are Wikidata entities (like "Q42" for "Douglas Adams", see https://www.wikidata.org/wiki/Q42).
Alexandre Flament 4 years ago
parent
commit
382fded665
1 changed files with 26 additions and 8 deletions
  1. 26 8
      searx/results.py

+ 26 - 8
searx/results.py

@@ -20,6 +20,18 @@ def result_content_len(content):
 
 
 def compare_urls(url_a, url_b):
+    """Lazy compare between two URL.
+    "www.example.com" and "example.com" are equals.
+    "www.example.com/path/" and "www.example.com/path" are equals.
+    "https://www.example.com/" and "http://www.example.com/" are equals.
+
+    Args:
+        url_a (ParseResult): first URL
+        url_b (ParseResult): second URL
+
+    Returns:
+        bool: True if url_a and url_b are equals
+    """
     # ignore www. in comparison
     if url_a.netloc.startswith('www.'):
         host_a = url_a.netloc.replace('www.', '', 1)
@@ -68,8 +80,10 @@ def merge_two_infoboxes(infobox1, infobox2):
         for url2 in infobox2.get('urls', []):
             unique_url = True
             parsed_url2 = urlparse(url2.get('url', ''))
+            entity_url2 = url2.get('entity')
             for url1 in urls1:
-                if compare_urls(urlparse(url1.get('url', '')), parsed_url2):
+                if (entity_url2 is not None and url1.get('entity') == entity_url2)\
+                   or compare_urls(urlparse(url1.get('url', '')), parsed_url2):
                     unique_url = False
                     break
             if unique_url:
@@ -86,18 +100,22 @@ def merge_two_infoboxes(infobox1, infobox2):
             infobox1['img_src'] = img2
 
     if 'attributes' in infobox2:
-        attributes1 = infobox1.get('attributes', None)
+        attributes1 = infobox1.get('attributes')
         if attributes1 is None:
-            attributes1 = []
-            infobox1['attributes'] = attributes1
+            infobox1['attributes'] = attributes1 = []
 
         attributeSet = set()
-        for attribute in infobox1.get('attributes', []):
-            if attribute.get('label', None) not in attributeSet:
-                attributeSet.add(attribute.get('label', None))
+        for attribute in attributes1:
+            label = attribute.get('label')
+            if label not in attributeSet:
+                attributeSet.add(label)
+            entity = attribute.get('entity')
+            if entity not in attributeSet:
+                attributeSet.add(entity)
 
         for attribute in infobox2.get('attributes', []):
-            if attribute.get('label', None) not in attributeSet:
+            if attribute.get('label') not in attributeSet\
+               and attribute.get('entity') not in attributeSet:
                 attributes1.append(attribute)
 
     if 'content' in infobox2: