'''
Transforms the raw Google Books and OpenLibrary JSON dumps for the
current day into a single JSON file of book records.
'''
import os
|
|
import json
|
|
import logging
|
|
from datetime import date, datetime
|
|
from dotenv import load_dotenv
|
|
|
|
# Load environment settings first: the LOG_FILE lookup below depends on it.
load_dotenv()

# LOG_FILE is optional. When it is unset, filename is None and
# logging.basicConfig attaches its default stream handler instead of a file.
log_file = os.environ.get('LOG_FILE')
logging.basicConfig(level=logging.INFO, filename=log_file)

# NOTE(review): the logger is named 'extract.py' although this script logs
# "Transformation" messages - possibly copied from the extract step; confirm
# before renaming, since the name appears in every log record.
logger = logging.getLogger('extract.py')

# Evaluated once at import time; used to build the dated input and output
# file names.
today = date.today()
|
|
|
|
def get_raw_json(path):
    '''
    Returns the data parsed from a JSON file.

    The file is always decoded as UTF-8 (the encoding JSON interchange
    uses) instead of the platform's default locale encoding, so
    non-ASCII text is read the same way on every system.

    Keyword arguments:
    path - the relative path for the JSON file to be read.

    Raises:
    OSError - if the file cannot be opened (e.g. FileNotFoundError).
    json.JSONDecodeError - if the file does not contain valid JSON.
    '''
    with open(path, 'r', encoding='utf-8') as json_file:
        # json.load parses straight from the file object.
        return json.load(json_file)
|
|
|
|
def format_sort_title(title):
    '''
    Returns the title rewritten for sorting: a leading "the", "a" or
    "an" (matched case-insensitively and followed by a space) is moved
    to the end after a comma, keeping its original casing. Titles
    without a leading article are returned unchanged.

    Keyword arguments:
    title - the string to be formatted.
    '''
    lowered = title.lower()
    for article in ('the', 'a', 'an'):
        # The trailing space keeps words such as "Theory" or "Another"
        # from being mistaken for an article.
        if lowered.startswith(article + ' '):
            width = len(article)
            return f'{title[width + 1:]}, {title[:width]}'
    return title
|
|
|
|
def _normalize_published_date(published_date):
    '''
    Returns a publishing date that carries a month and a day.

    Dates shorter than the 10-character YYYY-MM-DD layout (a bare year,
    or a year and month) are replaced by the last day of that year;
    longer values are returned unchanged.

    Keyword arguments:
    published_date - the date string taken from the Google Books data.
    '''
    if len(published_date) < 10:
        # Keep all four digits of the year; slicing [0:3] would drop the
        # last digit and turn '2005-06' into '200-12-31'.
        return published_date[0:4] + '-12-31'
    return published_date


def combine_raw_jsons(google_json, ol_json):
    '''
    Returns a dictionary consisting of an array of dictionaries.
    Each child dictionary is a transformed book ready to be
    inserted into a database.

    Entries are paired by position: the book at index i of
    google_json['book_data'] is combined with the entry at index i of
    ol_json['book_data']. Single quotes in the title, author and
    publisher are replaced with underscores. Optional Google fields
    that are missing fall back to '' (0 for the page count), and a
    missing publishing date becomes '9999-12-31'.

    Keyword arguments:
    google_json - A dictionary consisting of raw data from the Google Books API
    ol_json - A dictionary consisting of raw data from the OpenLibrary API

    Raises:
    KeyError - if a required key is missing from either input.
    IndexError - if ol_json holds fewer entries than google_json.
    '''
    # Loop-invariant, so it is built once instead of once per book.
    replace_quote = str.maketrans({"'": '_'})
    ol_books = ol_json['book_data']
    transformed_dictionary = {'books': []}

    for index, google_book in enumerate(google_json['book_data']):
        volume_info = google_book['volumeInfo']
        ol_book = ol_books[index]

        title = str(volume_info['title']).translate(replace_quote)
        author = ', '.join(ol_book['author_name']).translate(replace_quote)

        # Making sure the publishing date has a month and day associated
        if 'publishedDate' in volume_info:
            published_date = _normalize_published_date(volume_info['publishedDate'])
        else:
            published_date = '9999-12-31'

        transformed_dictionary['books'].append({
            'title': title,
            'author': author,
            'publisher': str(volume_info.get('publisher', '')).translate(replace_quote),
            'publishing_date': published_date,
            'isbn': ol_book['isbn'],
            'sort_title': format_sort_title(title),
            'format': volume_info.get('printType', ''),
            'language': volume_info.get('language', ''),
            'categories': ', '.join(volume_info.get('categories', [])),
            'page_count': volume_info.get('pageCount', 0),
            'is_checked_in': True,
            'is_archived': False,
            'is_lost': False,
            # Same far-future placeholder as the missing publishing date.
            'lost_date': '9999-12-31',
            'loc_number': ol_book['loc_number'],
            'dewey_decimal_number': ol_book['dewey_decimal_number'],
            'description': ol_book['description'],
            'price_in_cents': ol_book['price_in_cents'],
            'cover_image_uri': ol_book['cover_image_uri'],
        })

    return transformed_dictionary
|
|
|
|
def start():
    '''
    Reads the raw Google Books and OpenLibrary JSON files dated with
    the module-level `today` from the output/ directory, combines them
    and writes the result to output/transformed_<today>.json.

    The transformation and JSON serialisation are completed before the
    output file is opened, so a failure in either step does not leave
    behind an empty or truncated output file.
    '''
    google_json = get_raw_json(f'output/raw_google_books_{today}.json')
    ol_json = get_raw_json(f'output/raw_open_lib_books_{today}.json')

    # Opening with 'w' truncates the target immediately, so do all the
    # work that can fail first.
    serialized = json.dumps(combine_raw_jsons(google_json, ol_json))

    with open(f'output/transformed_{today}.json', 'w') as transformed:
        transformed.write(serialized)
|
|
|
|
if __name__ == '__main__':
    # Script entry point: report progress on stdout and in the log,
    # with the transformation itself running in between.
    print('Transformation Started')
    logger.info(f'{datetime.now()}:Transformation Started')

    start()

    # Only reached when start() returned without raising.
    print('Transformation Done')
    logger.info(f'{datetime.now()}:Transformation Done')
|