# mini/app.py

"""Script to access News article via HTTP using News API https://newsapi.org.

This script access news article via HTTP using trawler_requests and retrieves
JSON response and store in a CSV File in Colossus. Multiple news sources
can be accessed which results in different CSV files being created.
"""

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import pandas as pd
import csv
import json
import re
import urllib

from datetime import datetime, timedelta
from absl import app
from absl import flags
from absl import logging

import requests

FLAGS = flags.FLAGS

# Command-line flags of the script.
# NOTE(review): main() in this file only reads api_key, page_size and
# output_folder; date_from, date_to and sort_by are defined but not read there.
flags.DEFINE_string(
    'api_key',
    None, 'API Key from newsapi.org')
flags.DEFINE_string(
    'date_from',
    '2018-03-12',
    'A date and optional time for the oldest article allowed. This should be in'
    ' ISO 8601 format.')
flags.DEFINE_string(
    'date_to',
    '2018-03-12',
    'A date and optional time for the newest article allowed. This should be in'
    ' ISO 8601 format.')
flags.DEFINE_string(
    'sort_by',
    'popularity',
    'The order to sort the articles in.')
flags.DEFINE_integer(
    'page_size',
    100,
    'The number of results to return per page.',
    lower_bound=1,
    upper_bound=100)
# main() joins this value with a file name, so it is a directory, not a file.
flags.DEFINE_string(
    'output_folder', 'data', 'Folder where the results CSV file is written.')

# News sources and search terms.
# NOTE(review): _NEWS_SOURCES is not referenced anywhere else in this file;
# main() only iterates over _QUERY_TERMS.
_NEWS_SOURCES = ('techcrunch', 'bloomberg', 'the-washington-post', 'reuters',
                 'ars-technica', 'hacker-news', 'recode', 'business-insider',
                 'the-verge', 'techradar', 'wired')
_QUERY_TERMS = ('android startup', 'ios startup', 'mobile startup')
# Base URL of News API v2 and the endpoint name appended to it when the
# request URL is built.
_API_URL = 'https://newsapi.org/v2/'
_EVERYTHING = 'everything'

# Retry settings.
# NOTE(review): the original comment referred to pyglib.retry, but no retry
# logic is visible in this file and these two values are not used here.
_INITIAL_RETRY_INTERVAL_SECS = 1
_RETRY_TIMES = 3
# Wait this long for outgoing HTTP connections to be established.
_CONNECT_TIMEOUT_SECONDS = 90
# Wait this long to read from an HTTP socket.
_READ_TIMEOUT_SECONDS = 120
# Read timeout expressed in milliseconds (not referenced elsewhere in this
# file).
_FETCH_TIMEOUT_MS = _READ_TIMEOUT_SECONDS * 1000
# Overall request deadline: three minutes in milliseconds.
# NOTE(review): the original comment mentioned a "Trawler RPC"; this value is
# not referenced elsewhere in this file.
_REQUEST_DEADLINE_MS = 3 * 60 * 1000

# Keys of the News API JSON payload, plus values used when building queries.
# NOTE(review): _DEFAULT_COUNTRY and _POPULARITY are not referenced elsewhere
# in this file.
_AUTHOR = 'author'
_TITLE = 'title'
_DESCRIPTION = 'description'
_URL = 'url'
_URLTOIMAGE = 'urlToImage'
_DEFAULT_LANGUAGE = 'en'
_DEFAULT_COUNTRY = 'us'
_PUBLISHED = 'publishedAt'
_POPULARITY = 'popularity'
_RELEVANCY = 'relevancy'
_SOURCE = 'source'
_SOURCE_NAME = 'name'
_STATUS = 'status'
_OK = 'ok'

# Date formats accepted by _ToDatetimeObject: exactly yyyy-mm-dd.
DATE_REGEXES = [re.compile(r'^\d{4}-\d{2}-\d{2}$')]


class NewsApiInvalidContent(Exception):
    """Error type for a problem opening a News API URL.

    NOTE(review): this exception is not raised anywhere in the visible part
    of this file; confirm whether external callers still rely on it.
    """


class NewsApiAuth(requests.auth.AuthBase):
    """requests auth hook that signs requests sent to https://newsapi.org.

    The key generated by News API (https://newsapi.org/docs/authentication)
    is attached to each outgoing request through the headers built by
    _GetAuthHeaders.

    Attributes:
      api_key: (str) Key used to authenticate against News API.
    """

    def __init__(self, api_key):
        self.api_key = api_key

    def __call__(self, request):
        # Called with the outgoing request: add the authentication headers
        # and hand the same request object back.
        auth_headers = _GetAuthHeaders(self.api_key)
        request.headers.update(auth_headers)
        return request


def _GetAuthHeaders(api_key):
    """Returns the HTTP headers used to authenticate against the API.

    Args:
        api_key: (str) - API key, placed verbatim in the Authorization header.

    Returns:
        A dictionary mapping header names to their values.
    """
    headers = {'Content-Type': 'Application/JSON'}
    headers['Authorization'] = api_key
    return headers


def _ToString(value):
    """Returns `value` as a native `str`, or None when it is empty.

    Articles come in multiple languages, so text is always treated as UTF-8.
    The result is the interpreter's own `str` type (text on Python 3, UTF-8
    encoded bytes on Python 2), which keeps it compatible with `str` methods
    such as replace() and with the csv module on either version.

    Args:
      value: (None, str, bytes, unicode or any object) Value to convert.

    Returns:
      A str, or None if `value` is falsy (None, '', 0, ...).
    """

    if not value:
        logging.warning('Empty value')
        return None

    text_type = type(u'')  # `unicode` on Python 2, `str` on Python 3.
    if isinstance(value, bytes):
        text = value.decode('utf-8')
    elif isinstance(value, text_type):
        text = value
    else:
        text = text_type(value)

    if str is bytes:
        # Python 2: the native str type is a byte string.
        return text.encode('utf-8')
    return text


def _ToDatetimeObject(date_str):
    """Parses a yyyy-mm-dd string into a datetime object.

    Args:
      date_str: (str) Date in yyyy-mm-dd format. When empty or None, the
          current local date is used instead.

    Returns:
      datetime.datetime for the given (or current) date.

    Raises:
      ValueError: The string matches none of DATE_REGEXES, or cannot be
          parsed as a date.
    """
    date_format = '%Y-%m-%d'
    text = date_str if date_str else datetime.now().strftime(date_format)

    for pattern in DATE_REGEXES:
        if pattern.match(text):
            return datetime.strptime(text, date_format)
    raise ValueError('Invalid date format %s' % text)


class NewsApiClient(object):
    """Client that collects articles from the News API `everything` endpoint.

    Attributes:
      url: (str) Base HTTP URL used to contact the API.
      auth: (NewsApiAuth) requests auth hook carrying the News API key.
    """

    def __init__(self, api_key, api_url):
        """Constructor to define API key and URL.

        Args:
          api_key: (str) Corresponds to key to authenticate News Api.
          api_url: (str) URL to contact API.

        Raises:
          ValueError: Invalid URL.
        """

        if not api_url:
            raise ValueError('Invalid URL')
        self.url = api_url
        self.auth = NewsApiAuth(api_key=api_key)

    def _BuildUrl(self, endpoint, url_params):
        """Joins the base URL, an endpoint and URL-encoded query parameters."""
        # urlencode lives in urllib.parse on Python 3 and directly in urllib
        # on Python 2; importing it here keeps the client usable on both.
        try:
            from urllib.parse import urlencode
        except ImportError:
            from urllib import urlencode
        return '%s%s?%s' % (self.url, endpoint, urlencode(url_params))

    def _FetchContent(self, url_params):
        """Sends a GET request to the `everything` endpoint.

        Args:
          url_params: (dict) Query parameters of the request.

        Returns:
          The response body, or None when the page was not found (404) or
          the status code is not 200.

        Raises:
          requests.exceptions.HTTPError: for 4xx/5xx answers other than 404.
          requests.exceptions.RequestException: for timeouts and network or
            SSL problems raised by requests.
        """
        url = self._BuildUrl(_EVERYTHING, url_params)
        logging.info('Looking up: %s', url)
        # The context manager closes the session (and its connections) once
        # the response has been handled.
        with requests.Session() as session:
            response = session.get(
                url,
                auth=self.auth,
                timeout=(_CONNECT_TIMEOUT_SECONDS, _READ_TIMEOUT_SECONDS),
                allow_redirects=False,
                verify=True)  # Verify SSL Certificate.
            try:
                response.raise_for_status()
            except requests.exceptions.HTTPError:
                if response.status_code == 404:
                    logging.error('Page not found: %s', url)
                    return None
                # Any other HTTP error is propagated instead of being
                # silently turned into an empty result.
                raise
            if response.status_code != 200:
                # Non-error, non-200 answers, e.g. redirects, which are not
                # followed because allow_redirects is False.
                logging.error('Unexpected HTTP status %s for: %s',
                              response.status_code, url)
                return None
            return _FetchPageContent(response)

    def SearchArticlesByKeyword(self, keyword, date_from, date_to, page_size):
        """Searches articles matching a keyword, sorted by relevancy.

        Only articles in the default language are requested.

        Args:
          keyword: (str) - Search terms, sent as the `q` parameter.
          date_from: (datetime.datetime) - Date of the oldest article allowed.
          date_to: (datetime.datetime or None) - Date of the newest article
            allowed. When falsy, no upper bound is sent.
          page_size: (int) - Number of results requested per page.

        Returns:
          The response body returned by the API, or None when the page was
          not found (404) or the status code is not 200.

        Raises:
          ValueError: Invalid keyword.
          requests.exceptions.RequestException: for any HTTP request/response
            error other than a 404 answer.
        """
        if not keyword:
            raise ValueError('Invalid keyword')

        url_params = {
            'q': keyword,
            'language': _DEFAULT_LANGUAGE,
            'sortBy': _RELEVANCY,
            'from': date_from.strftime('%Y-%m-%d'),
            'pageSize': page_size
        }
        if date_to:
            url_params['to'] = date_to.strftime('%Y-%m-%d')
        return self._FetchContent(url_params)

    def GetArticles(self, source, sort_by, date_from, date_to, page_size):
        """Builds News URL and contacts News API for article information.

        Connects to the newsapi.org API and returns the raw JSON body with the
        articles of one news source.

        Example URL:
         https://newsapi.org/v2/everything?sources=techcrunch&pageSize=1

         https://newsapi.org/v2/everything?
           sources=techcrunch&
           from=2018-02-06&to=2018-02-06&
           sortBy=popularity&
           pageSize=10

        Args:
          source: (str) - A string defining which news source to use.
          sort_by: (str) - The order to sort the articles in.
            Possible options: relevancy, popularity, publishedAt.
            relevancy = articles more closely related to q come first.
            popularity = articles from popular sources and publishers come first.
            publishedAt = newest articles come first.
          date_from: (datetime.datetime) - Date of the oldest article allowed;
            sent to the API as yyyy-mm-dd.
          date_to: (datetime.datetime) - Date of the newest article allowed;
            sent to the API as yyyy-mm-dd.
          page_size: (int) - Page size. Default is 20. Max is 100.

        Returns:
          The response body returned by the API, or None when the page was
          not found (404) or the status code is not 200.

        Raises:
            ValueError: Invalid source.
            requests.exceptions.RequestException: for any HTTP request/response
            error,
            Specifically:
            - requests.exceptions.HTTPError: if the HTTP response was a 4xx/5xx
            error other than 404.
            - requests.exceptions.Timeout: if the HTTP request times out.
            - requests.exceptions.ConnectionError: if the HTTP request fails due to
            a network problem or SSL problem.
        """

        if not source:
            raise ValueError('Invalid source')

        url_params = {
            'sources': source,
            'from': date_from.strftime('%Y-%m-%d'),
            'to': date_to.strftime('%Y-%m-%d'),
            'sortBy': sort_by,
            'pageSize': page_size
        }
        return self._FetchContent(url_params)


def _FetchPageContent(response):
    """Returns the body of an HTTP response.

    Args:
        response: (requests.models.Response or None) Response obtained from
            the GET request.

    Returns:
        `response.content`, or None when `response` is falsy. Besides None,
        the requests documentation states that a Response with a 4xx/5xx
        status also evaluates to False.
    """
    if not response:
        # No exception is being handled here, so logging.exception would only
        # append an empty traceback; report a plain error instead.
        logging.error('HTTP response is missing or unsuccessful')
        return None
    return response.content


def _CleanColumn(value, strip_commas=False):
    """Converts one article field into a single-line CSV cell."""
    text = _ToString(value)
    if text is None:
        return ''
    if isinstance(text, bytes) and not isinstance(text, str):
        # On Python 3, bytes cannot be combined with the str arguments passed
        # to replace() below, so decode them first.
        text = text.decode('utf-8')
    # Remove new lines to be able to write in file one line per article.
    text = text.replace('\r', '').replace('\n', '')
    if strip_commas:
        text = text.replace(',', '')
    return text


def _HandleApiResponse(content):
    """This function handles HTTP response body in JSON format.

    Args:
      content: (str). API response information.
      A string in JSON format with the following fields:
       -status: str. Example: ok
       -source: str. Example: ars-technica
       -sortBy: str. Example: top
       -articles: list

    Returns:
      A list of lists, one per article, with the columns: source name,
      title, description, author, url, urlToImage and publishedAt. Missing
      (null) fields become empty strings. Commas are removed from title and
      description, and line breaks from every column. The list is empty when
      the response status is not ok.

    Raises:
      ValueError: `content` is empty or is not valid JSON.
    """

    if not content:
        raise ValueError('HTTP Response is None')

    content_json = json.loads(content)
    articles_processed = []
    status = content_json.get(_STATUS)
    if status != _OK:
        logging.error('Request failed. Status: %s', status)
    else:
        for article in content_json.get('articles') or []:
            try:
                source_name = article[_SOURCE][_SOURCE_NAME]
                logging.info('Article: %s Title: %s', source_name,
                             article[_TITLE])
                row = [
                    _CleanColumn(source_name),
                    _CleanColumn(article[_TITLE], strip_commas=True),
                    _CleanColumn(article[_DESCRIPTION], strip_commas=True),
                    _CleanColumn(article[_AUTHOR]),
                    _CleanColumn(article[_URL]),
                    _CleanColumn(article[_URLTOIMAGE]),
                    _CleanColumn(article[_PUBLISHED]),
                ]
            except (KeyError, TypeError, ValueError) as e:
                # Skip a malformed article instead of appending a stale row
                # left over from the previous iteration.
                logging.exception(e)
                continue
            articles_processed.append(row)
    logging.info('Processed: %d articles', len(articles_processed))
    return articles_processed


def LoadDataSet(filename):
    """Loads a CSV file into a pandas DataFrame.

    The file is read with the default options of pandas.read_csv.
    NOTE(review): SaveDataSet in this file writes no header row, while the
    pandas default is to take column names from the first row - confirm this
    is intended when loading files produced by SaveDataSet.

    Args:
      filename: (str) Path of the CSV file to read.

    Returns:
      pandas.DataFrame with the content of the file.
    """
    csv_path = '%s' % filename
    return pd.read_csv(csv_path)


def SaveDataSet(results, filename):
    """Writes results stored in a list of lists into a CSV file.

    Any existing file at `filename` is overwritten.

    Args:
      results: (list) list of lists Results with article information.
      filename: (str) Destination file.

    Raises:
      ValueError: Result list is empty.
      IOError: Unable to open or write filename.
    """

    if not results:
        raise ValueError('Result list is empty')

    if str is bytes:
        # Python 2: the csv module expects a file opened in binary mode.
        csvfile = open(filename, 'wb')
    else:
        # Python 3: the csv module requires newline='' so that it controls
        # line endings itself; UTF-8 keeps non-ASCII text portable.
        csvfile = open(filename, 'w', newline='', encoding='utf-8')
    with csvfile:
        csv.writer(csvfile).writerows(results)
    logging.info('Articles stored: %d.', len(results))


def GetDates(start_date, end_date, granularity):
    """Lists the dates between two days at a fixed step.

    Args:
      start_date: (str) First date of the range, in yyyy-mm-dd format.
      end_date: (str) Last date allowed in the range, in yyyy-mm-dd format.
      granularity: (str) 'daily' (1 day), 'weekly' (7 days) or 'monthly'
        (30 days). Any other value is treated as 'daily'.

    Returns:
      A list of yyyy-mm-dd strings starting at start_date and never going
      past end_date. Empty when start_date is after end_date.

    Raises:
      ValueError: A date is not in yyyy-mm-dd format.
    """
    date_format = '%Y-%m-%d'

    if granularity == 'weekly':
        step_days = 7
    elif granularity == 'monthly':
        step_days = 30
    else:
        step_days = 1

    first_day = datetime.strptime(start_date, date_format)
    last_day = datetime.strptime(end_date, date_format)
    total_days = (last_day - first_day).days
    return [
        (first_day + timedelta(days=offset)).strftime(date_format)
        for offset in range(0, total_days + 1, step_days)
    ]


def main(_):
    """Backfill script.

    Searches News API for every term in _QUERY_TERMS, starting from
    2018-01-01, and writes all the articles found into AppDiscovery.csv
    inside the folder given by --output_folder.

    Args:
      _: Unused positional arguments supplied by app.run.

    Raises:
      ValueError: No article was collected (raised by SaveDataSet).
    """
    logging.info('News collection started.')
    all_news = []
    date_from = _ToDatetimeObject('2018-01-01')
    news_api_client = NewsApiClient(FLAGS.api_key, _API_URL)
    for query_term in _QUERY_TERMS:
        api_output = news_api_client.SearchArticlesByKeyword(
            keyword=query_term,
            date_from=date_from,
            date_to=None,
            page_size=FLAGS.page_size)
        if not api_output:
            # The client returns None e.g. on a 404; _HandleApiResponse would
            # raise on it and abort the remaining query terms.
            logging.warning('No content returned for query: %s', query_term)
            continue
        news_results = _HandleApiResponse(api_output)
        if news_results:
            all_news.extend(news_results)
    output_file = '%s/%s.csv' % (FLAGS.output_folder, 'AppDiscovery')
    logging.info('Writing articles in file: %s', output_file)
    SaveDataSet(all_news, output_file)
    logging.info('News collection completed.')


# Script entry point: hand control to absl's app.run, which invokes main.
if __name__ == '__main__':
    app.run(main)