Source code for googlesearch

#!/usr/bin/env python

# Copyright (c) 2009-2020, Mario Vilas
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
#     * Redistributions of source code must retain the above copyright notice,
#       this list of conditions and the following disclaimer.
#     * Redistributions in binary form must reproduce the above copyright
#       notice, this list of conditions and the following disclaimer in the
#       documentation and/or other materials provided with the distribution.
#     * Neither the name of the copyright holder nor the names of its
#       contributors may be used to endorse or promote products derived from
#       this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
# POSSIBILITY OF SUCH DAMAGE.

import os
import random
import sys
import time
import ssl

if sys.version_info[0] > 2:
    from http.cookiejar import LWPCookieJar
    from urllib.request import Request, urlopen
    from urllib.parse import quote_plus, urlparse, parse_qs
else:
    from cookielib import LWPCookieJar
    from urllib import quote_plus
    from urllib2 import Request, urlopen
    from urlparse import urlparse, parse_qs

try:
    from bs4 import BeautifulSoup
    is_bs4 = True
except ImportError:
    from BeautifulSoup import BeautifulSoup
    is_bs4 = False

__all__ = [

    # Main search function.
    'search',

    # Shortcut for "get lucky" search.
    'lucky',

    # Miscellaneous utility functions.
    'get_random_user_agent', 'get_tbs',
]

# URL templates to make Google searches.
url_home = "https://www.google.%(tld)s/"
url_search = "https://www.google.%(tld)s/search?lr=lang_%(lang)s&" \
             "q=%(query)s&btnG=Google+Search&tbs=%(tbs)s&safe=%(safe)s&" \
             "cr=%(country)s&filter=0"
url_next_page = "https://www.google.%(tld)s/search?lr=lang_%(lang)s&" \
                "q=%(query)s&start=%(start)d&tbs=%(tbs)s&safe=%(safe)s&" \
                "cr=%(country)s&filter=0"
url_search_num = "https://www.google.%(tld)s/search?lr=lang_%(lang)s&" \
                 "q=%(query)s&num=%(num)d&btnG=Google+Search&tbs=%(tbs)s&" \
                 "safe=%(safe)s&cr=%(country)s&filter=0"
url_next_page_num = "https://www.google.%(tld)s/search?lr=lang_%(lang)s&" \
                    "q=%(query)s&num=%(num)d&start=%(start)d&tbs=%(tbs)s&" \
                    "safe=%(safe)s&cr=%(country)s&filter=0"
url_parameters = (
    'hl', 'q', 'num', 'btnG', 'start', 'tbs', 'safe', 'cr', 'filter')
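
# Example (an illustrative sketch, not executed on import): filling one of
# the templates above. Every parameter value here is an assumption chosen
# for demonstration.
#
#     query_url = url_search % {
#         'tld': 'com', 'lang': 'en',
#         'query': quote_plus('hello world'),
#         'tbs': '0', 'safe': 'off', 'country': '',
#     }
#     # -> "https://www.google.com/search?lr=lang_en&q=hello+world&..."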

# Cookie jar. Stored at the user's home folder.
# If the cookie jar is inaccessible, the errors are ignored.
home_folder = os.getenv('HOME')
if not home_folder:
    home_folder = os.getenv('USERPROFILE')   # Windows home folder.
    if not home_folder:
        home_folder = '.'   # Use the current folder on error.
cookie_jar = LWPCookieJar(os.path.join(home_folder, '.google-cookie'))
try:
    cookie_jar.load()
except Exception:
    pass

# Default user agent, unless instructed by the user to change it.
USER_AGENT = 'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.0)'

# Load the list of valid user agents from the install folder.
# The search order is:
#   * user_agents.txt.gz
#   * user_agents.txt
#   * default user agent
try:
    install_folder = os.path.abspath(os.path.split(__file__)[0])
    try:
        user_agents_file = os.path.join(install_folder, 'user_agents.txt.gz')
        import gzip
        with gzip.open(user_agents_file, 'rb') as fp:
            user_agents_list = [_.strip() for _ in fp.readlines()]
    except Exception:
        user_agents_file = os.path.join(install_folder, 'user_agents.txt')
        with open(user_agents_file) as fp:
            user_agents_list = [_.strip() for _ in fp.readlines()]
except Exception:
    user_agents_list = [USER_AGENT]


# Get a random user agent.
def get_random_user_agent():
    """
    Get a random user agent string.

    :rtype: str
    :return: Random user agent string.
    """
    return random.choice(user_agents_list)
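
# Example (an illustrative sketch, not executed on import): pairing
# get_random_user_agent with get_page, defined below, to vary the
# User-Agent header between requests.
#
#     html = get_page(url_home % {'tld': 'com'},
#                     user_agent=get_random_user_agent())
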
# Helper function to format the tbs parameter.
def get_tbs(from_date, to_date):
    """
    Helper function to format the tbs parameter.

    :param datetime.date from_date: Python date object.
    :param datetime.date to_date: Python date object.

    :rtype: str
    :return: Dates encoded in tbs format.
    """
    from_date = from_date.strftime('%m/%d/%Y')
    to_date = to_date.strftime('%m/%d/%Y')
    return 'cdr:1,cd_min:%(from_date)s,cd_max:%(to_date)s' % vars()
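
# Example (illustrative; the dates are assumptions chosen for
# demonstration): restricting results to the calendar year 2020.
#
#     >>> from datetime import date
#     >>> get_tbs(date(2020, 1, 1), date(2020, 12, 31))
#     'cdr:1,cd_min:01/01/2020,cd_max:12/31/2020'
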
# Request the given URL and return the response page, using the cookie jar.
# If the cookie jar is inaccessible, the errors are ignored.
def get_page(url, user_agent=None, verify_ssl=True):
    """
    Request the given URL and return the response page, using the cookie jar.

    :param str url: URL to retrieve.
    :param str user_agent: User agent for the HTTP requests.
        Use None for the default.
    :param bool verify_ssl: Verify the SSL certificate to prevent
        traffic interception attacks. Defaults to True.

    :rtype: str
    :return: Web page retrieved for the given URL.

    :raises IOError: An exception is raised on error.
    :raises urllib2.URLError: An exception is raised on error.
    :raises urllib2.HTTPError: An exception is raised on error.
    """
    if user_agent is None:
        user_agent = USER_AGENT
    request = Request(url)
    request.add_header('User-Agent', user_agent)
    cookie_jar.add_cookie_header(request)
    if verify_ssl:
        response = urlopen(request)
    else:
        context = ssl._create_unverified_context()
        response = urlopen(request, context=context)
    cookie_jar.extract_cookies(response, request)
    html = response.read()
    response.close()
    try:
        cookie_jar.save()
    except Exception:
        pass
    return html


# Filter links found in the Google result pages HTML code.
# Returns None if the link doesn't yield a valid result.
def filter_result(link):
    try:

        # Decode hidden URLs.
        if link.startswith('/url?'):
            o = urlparse(link, 'http')
            link = parse_qs(o.query)['q'][0]

        # Valid results are absolute URLs not pointing to a Google domain,
        # like images.google.com or googleusercontent.com for example.
        # TODO this could be improved!
        o = urlparse(link, 'http')
        if o.netloc and 'google' not in o.netloc:
            return link

    # On error, return None.
    except Exception:
        pass
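
# Example (illustrative; the URLs are made-up demonstration values) of how
# filter_result handles the link shapes found in result pages:
#
#     filter_result('/url?q=http://example.com/&sa=U')   # -> 'http://example.com/'
#     filter_result('http://example.com/page')           # -> 'http://example.com/page'
#     filter_result('https://images.google.com/imgres')  # -> None (Google domain)
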
# Returns a generator that yields URLs.

# Shortcut to single-item search.
# Evaluates the iterator to return the single URL as a string.
def lucky(*args, **kwargs):
    """
    Shortcut to single-item search.

    Same arguments as the main search function, but the return value changes.

    :rtype: str
    :return: URL found by Google.
    """
    return next(search(*args, **kwargs))
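
# Example (illustrative; performs a live request when actually run, and the
# query string is an assumption chosen for demonstration):
#
#     top_url = lucky('python web scraping tutorial')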