From 7f517c647fc14be7503c78fb6b1c628b6497c9e3 Mon Sep 17 00:00:00 2001 From: Nicholas Kalar Date: Thu, 19 Jun 2025 16:46:29 -0400 Subject: [PATCH] Major overhaul --- src/transform.py | 116 +++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 116 insertions(+) diff --git a/src/transform.py b/src/transform.py index e69de29..1401c3e 100644 --- a/src/transform.py +++ b/src/transform.py @@ -0,0 +1,116 @@ +import os +import json +import logging +from datetime import date, datetime + +logger = logging.getLogger('transform.py') +logging.basicConfig(filename=os.getenv('LOG_FILE'), level=os.getenv('LOGGING_LEVEL')) + +today = date.today() + +def get_raw_json(path): + ''' + Returns a dictionary read from a JSON file. + + Keyword arguments: + path - the relative path for the JSON file to be read. + ''' + with open(path, 'r') as json_file: + return json.loads(json_file.read()) + +def format_sort_title(title): + ''' + Returns a string with any leading article moved to the end. + + Keyword arguments: + title - the string to be formatted. + ''' + if title.lower().startswith('the '): + return f'{title[4:]}, {title[0:3]}' + elif title.lower().startswith('a '): + return f'{title[2:]}, {title[0]}' + elif title.lower().startswith('an '): + return f'{title[3:]}, {title[0:2]}' + return title + +def combine_raw_jsons(google_json, ol_json): + ''' + Returns a dictionary consisting of an array of dictionarys. + Each child dictionary is a transformed book ready to be + inserted into a database. + + Keyword arguments: + google_json - A dictionary consisting of raw data from the Google Books API + ol_json - A dictionary consisting of raw data from the OpenLibrary API + ''' + transformed_dictionary = {'books': []} + for index in range(len(google_json['book_data'])): + transformed_dictionary_entry = {} + replace_quote = str.maketrans({"'": r"_"}) + + title = str(ol_json['book_data'][index]['title']).translate(replace_quote) + author = ', '.join(ol_json['book_data'][index]['author_name']).translate(replace_quote) + isbn = ol_json['book_data'][index]['isbn'] + sort_title = format_sort_title(title) + + if 'categories' in google_json['book_data'][index]['volumeInfo']: + categories = ', '.join(google_json['book_data'][index]['volumeInfo']['categories']) + else: + categories = None + + if 'publisher' in google_json['book_data'][index]['volumeInfo']: + publisher = str(google_json['book_data'][index]['volumeInfo']['publisher']).translate(replace_quote) + else: + publisher = None + + if 'publishedDate' in google_json['book_data'][index]['volumeInfo']: + published_date = google_json['book_data'][index]['volumeInfo']['publishedDate'] + else: + published_date = None + + if 'printType' in google_json['book_data'][index]['volumeInfo']: + print_type = google_json['book_data'][index]['volumeInfo']['printType'] + else: + print_type = None + + if 'language' in google_json['book_data'][index]['volumeInfo']: + language = google_json['book_data'][index]['volumeInfo']['language'] + else: + language = None + + if 'pageCount' in google_json['book_data'][index]['volumeInfo']: + pageCount = google_json['book_data'][index]['volumeInfo']['pageCount'] + else: + pageCount = 0 + + transformed_dictionary_entry = { + 'title': title, + 'author': author, + 'publisher': publisher, + 'publishing_date': published_date, + 'isbn': isbn, + 'sort_title': sort_title, + 'format': print_type, + 'language': language, + 'categories': categories, + 'page_count': pageCount, + 'is_checked_in': True, + 'is_archived': False, + 'is_lost': False, + } + transformed_dictionary['books'].append(transformed_dictionary_entry) + + return transformed_dictionary + +def start(): + google_json = get_raw_json(f'output/raw_google_books_{today}.json') + ol_json = get_raw_json(f'output/raw_open_lib_books_{today}.json') + with open(f'output/transformed_{today}.json', 'w') as transformed: + transformed.write(json.dumps(combine_raw_jsons(google_json, ol_json))) + +if __name__ == '__main__': + print('Transformation Started') + logger.info(f'{datetime.now()}:Transformation Started') + start() + print('Transformation Done') + logger.info(f'{datetime.now()}:Transformation Done')