# LMS-DB-ETL/src/transform.py

import os
import json
import logging
from datetime import date, datetime
from dotenv import load_dotenv

load_dotenv()

# Log destination is read from the environment. When LOG_FILE is unset,
# os.getenv returns None and basicConfig attaches a stderr stream handler
# instead of a file handler.
log_file = os.getenv('LOG_FILE')
logging.basicConfig(filename=log_file, level=logging.INFO)
# Named after this module so its records are not attributed to extract.py
logger = logging.getLogger('transform.py')

# Run date, captured once at import time; start() uses it to build the
# dated input and output file names.
today = date.today()

def get_raw_json(path):
    '''
        Opens a JSON file and hands back its parsed content.

        Keyword arguments:
        path - location of the JSON file to parse.
    '''
    with open(path) as source:
        return json.load(source)

def format_sort_title(title):
    '''
        Produces the sort form of a title: a leading "the", "a" or "an"
        (matched case-insensitively and followed by a space) is moved
        behind the rest of the title, separated by a comma. The article
        keeps the casing it had in the original title. Titles without
        such an article come back unchanged.

        Keyword arguments:
        title - the title string to convert.
    '''
    lowered = title.lower()
    for article in ('the', 'a', 'an'):
        if lowered.startswith(f'{article} '):
            size = len(article)
            # Skip the article and the single space that follows it
            remainder = title[size + 1:]
            return f'{remainder}, {title[:size]}'
    return title

def _normalize_published_date(volume_info):
    '''
        Returns the publishing date of one Google Books entry as a
        ten-character YYYY-MM-DD style string.

        Keyword arguments:
        volume_info - the 'volumeInfo' dictionary of a Google Books entry.
    '''
    if 'publishedDate' not in volume_info:
        # Placeholder used when the entry carries no publishing date at all
        return '9999-12-31'

    published_date = volume_info['publishedDate']
    if len(published_date) < 10:
        # Partial dates (year only, or year and month) keep the full
        # four-character year and are pinned to December 31st.
        return published_date[:4] + '-12-31'
    return published_date


def combine_raw_jsons(google_json, ol_json):
    '''
        Returns a dictionary consisting of an array of dictionaries
        stored under the 'books' key. Each child dictionary is a
        transformed book ready to be inserted into a database.

        Entries are paired by position: the book at a given index in
        google_json['book_data'] is merged with the book at the same
        index in ol_json['book_data'].

        Keyword arguments:
        google_json - A dictionary consisting of raw data from the Google Books API
        ol_json - A dictionary consisting of raw data from the OpenLibrary API

        Raises:
        KeyError - if a required key ('book_data', 'volumeInfo', 'title',
                   or any of the OpenLibrary fields read below) is missing.
        IndexError - if ol_json holds fewer books than google_json.
    '''
    # Built once for the whole run: swaps single quotes for underscores
    # in the free-text fields (title, author, publisher).
    replace_quote = str.maketrans({"'": "_"})

    books = []
    for index, google_book in enumerate(google_json['book_data']):
        volume_info = google_book['volumeInfo']
        ol_book = ol_json['book_data'][index]

        title = str(volume_info['title']).translate(replace_quote)
        author = ', '.join(ol_book['author_name']).translate(replace_quote)

        # Optional Google Books fields fall back to empty/zero defaults
        if 'publisher' in volume_info:
            publisher = str(volume_info['publisher']).translate(replace_quote)
        else:
            publisher = ''

        books.append({
            'title':                    title,
            'author':                   author,
            'publisher':                publisher,
            'publishing_date':          _normalize_published_date(volume_info),
            'isbn':                     ol_book['isbn'],
            'sort_title':               format_sort_title(title),
            'format':                   volume_info.get('printType', ''),
            'language':                 volume_info.get('language', ''),
            'categories':               ', '.join(volume_info.get('categories', [])),
            'page_count':               volume_info.get('pageCount', 0),
            'is_checked_in':            True,
            'is_archived':              False,
            'is_lost':                  False,
            # Placeholder date: a freshly transformed book is not lost
            'lost_date':                '9999-12-31',
            'loc_number':               ol_book['loc_number'],
            'dewey_decimal_number':     ol_book['dewey_decimal_number'],
            'description':              ol_book['description'],
            'price_in_cents':           ol_book['price_in_cents'],
            'cover_image_uri':          ol_book['cover_image_uri'],
        })

    return {'books': books}

def start():
    '''
        Runs the transformation step: reads today's raw Google Books and
        OpenLibrary dumps from the output directory, merges them with
        combine_raw_jsons and writes the result to today's transformed
        JSON file in the same directory.
    '''
    raw_google = get_raw_json(f'output/raw_google_books_{today}.json')
    raw_open_lib = get_raw_json(f'output/raw_open_lib_books_{today}.json')
    with open(f'output/transformed_{today}.json', 'w') as out_file:
        serialized = json.dumps(combine_raw_jsons(raw_google, raw_open_lib))
        out_file.write(serialized)

# Script entry point: the transformation runs only when this file is
# executed directly, not when it is imported.
if __name__ == '__main__':
    # Progress is reported twice: on stdout and, with a timestamp, through the logger
    print('Transformation Started')
    logger.info(f'{datetime.now()}:Transformation Started')
    start()
    # Only reached when start() returned without raising
    print('Transformation Done')
    logger.info(f'{datetime.now()}:Transformation Done')