From 01977eab4e6f208d2fccbe382f82fd2542aebf70 Mon Sep 17 00:00:00 2001 From: Nicholas Kalar Date: Sat, 2 Aug 2025 15:50:06 -0400 Subject: [PATCH] Minor updates --- README.md | 2 +- src/extract.py | 33 ++++++++++++++++++++++----------- src/load.py | 2 +- src/transform.py | 2 +- 4 files changed, 25 insertions(+), 14 deletions(-) diff --git a/README.md b/README.md index f33870f..49eaf83 100644 --- a/README.md +++ b/README.md @@ -9,7 +9,7 @@ Environmental Variables: `LOG_FILE` - The file location for logs to be saved to. ## extract.py -The extract.py file contains functions to pull data related to books from different APIs. +The extract.py file contains functions to pull data related to books from different APIs. Currently, this project uses the Google Books and OpenLibrary APIs. Only the former requires an API key. ## transform.py Takes the raw JSON stored by extract.py and transforms the entries into a single entry whose keys diff --git a/src/extract.py b/src/extract.py index 8ad7f34..00eb896 100644 --- a/src/extract.py +++ b/src/extract.py @@ -26,13 +26,21 @@ def extract_book_json(url, header=[]): url -- the url used to make the request. header -- the optional headers passed to specify things needed for the queries, like API keys. 
''' - try: - response = requests.get(url, headers=header) - response.raise_for_status() - except requests.exceptions.HTTPError as err: - logger.error(f'An error occurred: {err}') - return {} - return response.json() + attempts = 0 + max_attempts = 5 + response_json = {} + while attempts < max_attempts: + try: + response = requests.get(url, headers=header) + response.raise_for_status() + response_json = response.json() + break + except requests.exceptions.HTTPError as err: + logger.error(f'An error occurred: {err}') + time.sleep(.25) + attempts += 1 + continue + return response_json def get_google_book_data(query, offset=0): ''' @@ -88,7 +96,6 @@ def start(): open_lib_query = f'title={title}' open_lib_books = get_open_library_book_data(open_lib_query) for books in open_lib_books['docs']: - logger.info(f'{datetime.now()}:Book found: {str(books)}') if 'author_name' in books \ and 'title' in books \ and 'isbn' in books: @@ -98,9 +105,13 @@ def start(): google_book_info = get_google_book_data(google_query) if google_book_info != {}: - potential_ol_book = books - potential_ol_book['isbn'] = isbn - + potential_ol_book = { + '_timestamp': str(datetime.now()), + 'author_name': books['author_name'], + 'title': books['title'], + 'isbn': isbn, + } + logger.info(f'{datetime.now()}:Book found: {str(potential_ol_book)}') open_lib_array.append(potential_ol_book) google_books_array.append(google_book_info['items'][0]) time.sleep(.5) diff --git a/src/load.py b/src/load.py index 9c07368..01517b3 100644 --- a/src/load.py +++ b/src/load.py @@ -45,7 +45,7 @@ def start(): with psycopg.connect(f'dbname={db_name} user={db_user} password={db_password}') as conn, \ open(f'output/transformed_{today}.json', 'r') as transformed_books: with conn.cursor() as cur: - cur.execute(f'DROP TABLE IF EXISTS Collection_Item') # TODO: REMOVE WHEN TESTING COMPLETED + # cur.execute(f'DROP TABLE IF EXISTS Collection_Item') # TODO: REMOVE WHEN TESTING COMPLETED cur.execute(collections_table_creation) books = 
json.loads(transformed_books.read()) diff --git a/src/transform.py b/src/transform.py index 482e502..2cd723b 100644 --- a/src/transform.py +++ b/src/transform.py @@ -48,7 +48,7 @@ def combine_raw_jsons(google_json, ol_json): transformed_dictionary_entry = {} replace_quote = str.maketrans({"'": r"_"}) - title = str(ol_json['book_data'][index]['title']).translate(replace_quote) + title = str(google_json['book_data'][index]['volumeInfo']['title']).translate(replace_quote) author = ', '.join(ol_json['book_data'][index]['author_name']).translate(replace_quote) isbn = ol_json['book_data'][index]['isbn'] sort_title = format_sort_title(title)