#!/usr/bin/env python

"""
 BASE (Scholar publications)

 @website     https://base-search.net
 @provide-api yes with authorization (https://api.base-search.net/)

 @using-api   yes
 @results     XML
 @stable      ?
 @parse       url, title, publishedDate, content
 More info on api: http://base-search.net/about/download/base_interface.pdf
"""

from lxml import etree
try:
    from urllib import urlencode
except ImportError:
    # Python 3 moved urlencode into urllib.parse
    from urllib.parse import urlencode
from searx.utils import searx_useragent
from datetime import datetime
import re


categories = ['science']

base_url = 'https://api.base-search.net/cgi-bin/BaseHttpSearchInterface.fcgi'\
           + '?func=PerformSearch&{query}&boost=oa&hits={hits}&offset={offset}'

# engine dependent config
paging = True
number_of_results = 10

# shortcuts for advanced search
# (the misspelled name is kept so existing references keep working)
shorcut_dict = {
    # user-friendly keywords
    'format:': 'dcformat:',
    'author:': 'dccreator:',
    'collection:': 'dccollection:',
    'hdate:': 'dchdate:',
    'contributor:': 'dccontributor:',
    'coverage:': 'dccoverage:',
    'date:': 'dcdate:',
    'abstract:': 'dcdescription:',
    'urls:': 'dcidentifier:',
    'language:': 'dclanguage:',
    'publisher:': 'dcpublisher:',
    'relation:': 'dcrelation:',
    'rights:': 'dcrights:',
    'source:': 'dcsource:',
    'subject:': 'dcsubject:',
    'title:': 'dctitle:',
    # NOTE(review): 'dcdctype:' looks unusual next to the other dc* fields;
    # confirm against the BASE interface documentation.
    'type:': 'dcdctype:'
}

# date layouts tried, in order, when parsing a dcdate value
_DATE_FORMATS = ('%Y-%m-%dT%H:%M:%SZ', '%Y-%m-%d', '%Y-%m', '%Y')

# maximum number of description characters shown in a result
_MAX_CONTENT_LENGTH = 300


def _expand_shortcuts(query):
    """Replace user-friendly shortcuts in *query* with API search keywords.

    All shortcuts are substituted in a single pass and only when they start
    at a word boundary.  This keeps 'date:' from matching inside 'hdate:',
    inside an already expanded 'dchdate:', or inside an unrelated word such
    as 'update:', and leaves API keywords typed directly by the user
    (e.g. 'dcdate:') untouched.
    """
    if not shorcut_dict:
        return query
    pattern = r'\b(?:' + '|'.join(re.escape(key) for key in shorcut_dict) + r')'
    return re.sub(pattern,
                  lambda match: shorcut_dict[match.group(0)],
                  query,
                  flags=re.UNICODE)


def _parse_date(text):
    """Return *text* parsed as a datetime, or None if no known format fits."""
    if not text:
        return None
    text = text.strip()
    for date_format in _DATE_FORMATS:
        try:
            return datetime.strptime(text, date_format)
        except ValueError:
            continue
    return None


def request(query, params):
    """Fill *params* with the URL and headers for a BASE search.

    Shortcuts in *query* are expanded to API keywords, the result offset is
    derived from params['pageno'] and number_of_results, and the User-Agent
    header is set.  Returns the same *params* dict.
    """
    # replace shortcuts with API advanced search keywords
    query = _expand_shortcuts(query)

    # basic search
    offset = (params['pageno'] - 1) * number_of_results

    string_args = dict(query=urlencode({'query': query}),
                       offset=offset,
                       hits=number_of_results)

    params['url'] = base_url.format(**string_args)

    params['headers']['User-Agent'] = searx_useragent()
    return params


def response(resp):
    """Parse the XML in resp.content into a list of result dicts.

    Each dict has 'url', 'title' and 'content'; 'publishedDate' is added
    only when the entry's dcdate value matches one of the known formats.
    Entries without a link or a title are skipped.
    """
    results = []

    search_results = etree.XML(resp.content)

    for entry in search_results.xpath('./result/doc'):
        content = "No description available"
        # reset per entry so a missing field never reuses the previous
        # entry's value
        url = None
        title = None
        date = None

        for item in entry:
            name = item.get("name")
            text = item.text
            if text is None:
                # empty element: nothing to extract
                continue

            if name == "dcdate":
                date = text
            elif name == "dctitle":
                title = text
            elif name == "dclink":
                url = text
            elif name == "dcdescription":
                content = text[:_MAX_CONTENT_LENGTH]
                if len(text) > _MAX_CONTENT_LENGTH:
                    content += "..."

        if not url or not title:
            continue

        res_dict = {'url': url,
                    'title': title,
                    'content': content}

        # dates returned by the BASE API come in several formats
        publishedDate = _parse_date(date)
        if publishedDate is not None:
            res_dict['publishedDate'] = publishedDate

        results.append(res_dict)

    return results