Files
LMS-DB-ETL/src/transform.py
2025-11-06 16:32:04 -05:00

141 lines
5.5 KiB
Python

import os
import json
import logging
from datetime import date, datetime
from dotenv import load_dotenv
# Called before LOG_FILE is read below; presumably this loads variables from
# a .env file into the process environment (python-dotenv) - confirm.
load_dotenv()
# Path of the log file, or None when LOG_FILE is not set in the environment.
log_file = os.getenv('LOG_FILE')
# When log_file is None, basicConfig falls back to a stream handler (stderr)
# instead of writing to a file.
logging.basicConfig(filename=log_file, level=logging.INFO)
# NOTE(review): the logger is named 'extract.py' although this module appears
# to be transform.py - looks like a copy-paste leftover; confirm before
# renaming, since the name shows up in every log record.
logger = logging.getLogger('extract.py')
# Evaluated once at import time; start() interpolates it into the names of
# the input and output files.
today = date.today()
def get_raw_json(path):
    '''
    Load a JSON document from disk and return the parsed result.

    Keyword arguments:
    path - location of the JSON file to read; the file is opened in
           text mode for reading.
    '''
    with open(path, 'r') as source:
        parsed = json.load(source)
    return parsed
def format_sort_title(title):
    '''
    Build the sort form of a title by moving a leading English article
    ("the", "a" or "an", matched case-insensitively and followed by a
    space) to the end, after a comma. The article keeps the casing it
    had in the input. Titles without such a prefix come back unchanged.

    Keyword arguments:
    title - the string to be formatted.
    '''
    lowered = title.lower()
    for article in ('the', 'a', 'an'):
        if lowered.startswith(article + ' '):
            cut = len(article)
            # Skip the single space that separated the article from the rest.
            return f'{title[cut + 1:]}, {title[:cut]}'
    return title
def combine_raw_jsons(google_json, ol_json):
    '''
    Returns a dictionary of the form {'books': [...]}, where each list
    element is a transformed book built from the Google Books entry and
    the OpenLibrary entry found at the same index.

    Single quotes in the title, author and publisher are replaced with
    underscores. Optional Google fields that are absent fall back to an
    empty string (page count falls back to 0). A missing publishing date
    becomes '9999-12-31'; a publishing date shorter than a full
    'YYYY-MM-DD' string is replaced by the last day of its year.

    Keyword arguments:
    google_json - A dictionary consisting of raw data from the Google Books API
    ol_json - A dictionary consisting of raw data from the OpenLibrary API

    Raises:
    KeyError - if a required key ('book_data', 'volumeInfo', 'title', or
               any of the OpenLibrary fields read below) is missing.
    IndexError - if ol_json holds fewer books than google_json.
    '''
    # Loop-invariant: maps every single quote to an underscore.
    replace_quote = str.maketrans({"'": '_'})
    ol_books = ol_json['book_data']
    books = []
    for index, google_book in enumerate(google_json['book_data']):
        volume_info = google_book['volumeInfo']
        # The two inputs are paired purely by position.
        ol_book = ol_books[index]

        title = str(volume_info['title']).translate(replace_quote)
        author = ', '.join(ol_book['author_name']).translate(replace_quote)
        isbn = ol_book['isbn']
        sort_title = format_sort_title(title)

        # Optional Google Books fields fall back to neutral defaults.
        categories = ', '.join(volume_info.get('categories', []))
        if 'publisher' in volume_info:
            publisher = str(volume_info['publisher']).translate(replace_quote)
        else:
            publisher = ''
        if 'publishedDate' in volume_info:
            published_date = volume_info['publishedDate']
            # Partial dates (presumably 'YYYY' or 'YYYY-MM') are pinned to
            # the end of their year. Keep all four year characters: the
            # previous [0:3] slice turned '2005-06' into '200-12-31'.
            if len(published_date) < 10:
                published_date = published_date[:4] + '-12-31'
        else:
            published_date = '9999-12-31'
        print_type = volume_info.get('printType', '')
        language = volume_info.get('language', '')
        page_count = volume_info.get('pageCount', 0)

        books.append({
            'title': title,
            'author': author,
            'publisher': publisher,
            'publishing_date': published_date,
            'isbn': isbn,
            'sort_title': sort_title,
            'format': print_type,
            'language': language,
            'categories': categories,
            'page_count': page_count,
            'is_checked_in': True,
            'is_archived': False,
            'is_lost': False,
            # Placeholder date used for books that are not lost.
            'lost_date': '9999-12-31',
            'loc_number': ol_book['loc_number'],
            'dewey_decimal_number': ol_book['dewey_decimal_number'],
            'description': ol_book['description'],
            'price_in_cents': ol_book['price_in_cents'],
            'cover_image_uri': ol_book['cover_image_uri'],
        })
    return {'books': books}
def start():
    '''
    Reads today's raw Google Books and OpenLibrary JSON files from the
    output directory, combines them with combine_raw_jsons, and writes
    the result as JSON to output/transformed_<today>.json.
    '''
    google_json = get_raw_json(f'output/raw_google_books_{today}.json')
    ol_json = get_raw_json(f'output/raw_open_lib_books_{today}.json')
    # Transform and serialize before opening the output file: mode 'w'
    # creates/truncates the file on open, so doing the work first avoids
    # leaving an empty transformed file behind when the transform raises.
    serialized = json.dumps(combine_raw_jsons(google_json, ol_json))
    with open(f'output/transformed_{today}.json', 'w') as transformed:
        transformed.write(serialized)
if __name__ == '__main__':
    # Script entry point: announce the run on stdout and in the log,
    # execute the transformation, then announce completion the same way.
    print('Transformation Started')
    logger.info(f'{datetime.now()}:Transformation Started')

    start()

    print('Transformation Done')
    logger.info(f'{datetime.now()}:Transformation Done')