Added additional context

This commit is contained in:
2025-11-06 16:32:04 -05:00
parent a141fa76f1
commit 085cc008b4
4 changed files with 40 additions and 6 deletions

View File

@@ -1,5 +1,30 @@
# LMS-DB-ETL
An Extract, Transform, Load app to gather book information from public APIs for a POC LMS project
An Extract, Transform, Load (ETL) app to gather book information from public APIs for
a Proof of Concept Library Management System project.
(Past Git history can be found at: https://github.com/Kalarsoft/LMS-DB-ETL and
https://gitea.com/NickKalar/LMS-DB-ETL)
## Problem
Currently, I am working on building a Library Management System (LMS) to help
develop and showcase my software engineering skills. In order to fully test
and run the LMS, I need to have a database that is populated by a variety of
different media. As I am one person, and have only about 300 books to my name,
this problem needed a better solution than manually adding in those books.
## Solution
This project seeks to seed a database with book details, mostly pulled from
public APIs. The current version uses the Google Books API and Open Library
API. After pulling data from these APIs for several books, the data is merged
and transformed to be loaded into a PostgreSQL database for consumption by the
RESTful APIs associated with the LMS project.
This is a rudimentary ETL pipeline: it relies on no external tools and on only
two Python libraries, used for making the API calls and connecting to the database.
However, it does showcase my understanding of Data Engineering and the ETL
cycle.
## Setup
Environmental Variables:
`GOOGLE_API_KEY` - API Key required for using the Google Books API.

View File

@@ -23,8 +23,9 @@ db_port = os.getenv('DB_PORT')
today = date.today()
def start():
time.sleep(.25)
logger.info(f"{datetime.now()}: Attempting connection...")
# Attempting to connect to the Book Database for loading book information
db_connection_string = f'dbname={db_name} user={db_user} password={db_password} host={db_host} port={db_port}'
with psycopg.connect(db_connection_string) as conn, \
open(f'output/transformed_{today}.json', 'r') as transformed_books:
@@ -49,12 +50,14 @@ def load_transformed_books(cursor, books):
`collection_item` SQL table
'''
for book in books['books']:
# This simulates a library buying multiple copies of a book.
# This for i in range statment simulates a library buying multiple copies of a book.
try:
for i in range(random.randrange(1, 10)):
cursor.execute(sql_statements.collection_insert_statement(book))
logger.info(f'{datetime.now()}:Book {book['title']} loaded {i+1} times.')
except Exception as err:
# if one, or even a series of books are bad, we still want to load what we can.
# Log the error and move on.
logger.error(f'{err} at {book.title}')
if __name__ == '__main__':

View File

@@ -15,6 +15,10 @@ logger = logging.getLogger('extract.py')
today = date.today()
# The Orchestrator is a simple app designed to facilitate the execution of
# the ETL pipeline. Should any system fail, the Orchestrator will log the
# error and exit.
if __name__ == '__main__':
try:
logger.info(f'{datetime.now()}:Starting extract.py')

View File

@@ -39,7 +39,7 @@ def format_sort_title(title):
def combine_raw_jsons(google_json, ol_json):
'''
Returns a dictionary consisting of an array of dictionarys.
Returns a dictionary consisting of an array of dictionaries.
Each child dictionary is a transformed book ready to be
inserted into a database.
@@ -57,6 +57,7 @@ def combine_raw_jsons(google_json, ol_json):
isbn = ol_json['book_data'][index]['isbn']
sort_title = format_sort_title(title)
# Ensuring field variables have valid data
if 'categories' in google_json['book_data'][index]['volumeInfo']:
categories = ', '.join(google_json['book_data'][index]['volumeInfo']['categories'])
else:
@@ -69,6 +70,7 @@ def combine_raw_jsons(google_json, ol_json):
if 'publishedDate' in google_json['book_data'][index]['volumeInfo']:
published_date = google_json['book_data'][index]['volumeInfo']['publishedDate']
# Making sure the publishing date has a month and day associated
if len(published_date) == 4:
published_date += '-12-31'
elif len(published_date) < 10:
@@ -76,8 +78,6 @@ def combine_raw_jsons(google_json, ol_json):
else:
published_date = '9999-12-31'
lost_date = '9999-12-31'
if 'printType' in google_json['book_data'][index]['volumeInfo']:
print_type = google_json['book_data'][index]['volumeInfo']['printType']
else:
@@ -98,6 +98,7 @@ def combine_raw_jsons(google_json, ol_json):
description = ol_json['book_data'][index]['description']
price_in_cents = ol_json['book_data'][index]['price_in_cents']
cover_image_uri = ol_json['book_data'][index]['cover_image_uri']
lost_date = '9999-12-31'
transformed_dictionary_entry = {
'title': title,
@@ -120,6 +121,7 @@ def combine_raw_jsons(google_json, ol_json):
'price_in_cents': price_in_cents,
'cover_image_uri': cover_image_uri,
}
transformed_dictionary['books'].append(transformed_dictionary_entry)
return transformed_dictionary