'''
Search ArXiv via its search API.

Author: Kristjan Kannike
Created: 2007-11-16
Modified: 2007-11-22

The ArXiv Search API is described in the arXiv API user manual.
The results (Atom feeds) can be conveniently parsed with the Feedparser
library.
'''

# TODO: A simplified query syntax ('electron AND positron' pro
# 'all:electron+AND+all:positron', etc.).

import urllib
import time

try:
    from urllib.request import urlopen as _urlopen  # Python 3
except ImportError:
    _urlopen = urllib.urlopen  # Python 2

# Seconds to pause between consecutive requests.
wait_time = 3  # From the manual

# Opening tag of one result in the returned Atom feed; counting it tells
# how many results a page actually contains.
_ENTRY_TAG = '<entry>'


def single_query(search_query='', id_list='', start=0, max_res=10):
    '''A single query of the ArXiv search API. Return an Atom string.

    (Note that the results are sorted in a way peculiar to the API.)

    search_query and id_list are inserted into the URL verbatim (they are
    NOT URL-encoded here), so they must already be in the form the API
    expects, e.g. 'all:electron+AND+all:positron'.
    start is the index of the first result and max_res the maximal
    number of results to request.
    '''
    template = 'http://export.arxiv.org/api/query?' \
               'search_query=%s&id_list=%s&start=%d&max_results=%d'
    query_str = template % (search_query, id_list, start, max_res)
    response = _urlopen(query_str)
    # try/finally rather than "with": the Python 2 response object is not
    # a context manager.
    try:
        data = response.read()
    finally:
        response.close()
    # Python 3 hands back bytes; callers expect a text string.
    if not isinstance(data, str):
        data = data.decode('utf-8')
    return data


def arxiv_query(search_query='', id_list='', start=0, total_res=None,
                max_res=100, fetch=None, wait=None):
    '''Query the ArXiv search API. Return a list of Atom strings.

    (Note that the results are sorted in a way peculiar to the API.)

    If the number of results total_res is not specified, get all results
    in chunks of max_res.  Otherwise request total_res results, in chunks
    of at most max_res.  In both cases stop early as soon as a page holds
    fewer entries than were requested (the results are exhausted).

    fetch -- optional callable(search_query, id_list, start, max_res)
             returning one Atom string; defaults to single_query.
    wait  -- optional pause in seconds between consecutive requests;
             defaults to the module-level wait_time.

    Raise ValueError if max_res is smaller than 1.
    '''
    if max_res < 1:
        raise ValueError('max_res must be a positive integer')
    if fetch is None:
        fetch = single_query
    if wait is None:
        wait = wait_time  # read at call time so module overrides apply

    # Everything fits into one request (this also covers total_res == 0).
    if total_res is not None and max_res > total_res:
        return [fetch(search_query, id_list, start, total_res)]

    queries = []
    remaining = total_res  # None means "no limit"
    while remaining is None or remaining > 0:
        if queries:
            # Be polite to the server between consecutive requests.
            time.sleep(wait)
        if remaining is None:
            size = max_res
        else:
            size = min(max_res, remaining)
        query = fetch(search_query, id_list, start, size)
        queries.append(query)
        # A short page means there is nothing more to fetch.
        if query.count(_ENTRY_TAG) < size:
            break
        start += size
        if remaining is not None:
            remaining -= size
    return queries