Added additional context
README.md | 27
@@ -1,5 +1,30 @@
 # LMS-DB-ETL

-An Extract, Transform, Load app to gather book information from public APIs for a POC LMS project
+An Extract, Transform, Load (ETL) app to gather book information from public APIs for
+a Proof of Concept Library Management System project.
+
+(Past Git history can be found at: https://github.com/Kalarsoft/LMS-DB-ETL and
+https://gitea.com/NickKalar/LMS-DB-ETL)
+
+## Problem
+Currently, I am working on building a Library Management System (LMS) to help
+develop and showcase my software engineering skills. In order to fully test
+and run the LMS, I need a database populated with a variety of different
+media. As I am one person, and have only about 300 books to my name, this
+problem needed a better solution than manually adding in those books.
+
+## Solution
+This project seeks to seed a database with book details, mostly pulled from
+public APIs. The current version uses the Google Books API and the Open
+Library API. After pulling data from these APIs for several books, the data
+is merged and transformed to be loaded into a PostgreSQL database for
+consumption by the RESTful APIs associated with the LMS project.
+
+This is a rudimentary ETL pipeline: it uses no external tools, and only two
+Python libraries for making the API calls and connecting to the database.
+However, it does showcase my understanding of Data Engineering and the ETL
+cycle.
+
 ## Setup

 Environment Variables:
 `GOOGLE_API_KEY` - API Key required for using the Google Books API.
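For context on the Solution section above, here is a minimal sketch of the extract/transform/load shape it describes. The two Python libraries it alludes to are presumably `requests` and `psycopg`; every function, table, and field name below is illustrative, not the project's actual code:

```python
import requests  # API calls
import psycopg   # PostgreSQL connection


def extract(isbn):
    # Pull raw book data for one ISBN from the two public APIs the README names.
    google = requests.get(
        'https://www.googleapis.com/books/v1/volumes',
        params={'q': f'isbn:{isbn}'}, timeout=10).json()
    open_library = requests.get(
        f'https://openlibrary.org/isbn/{isbn}.json', timeout=10).json()
    return google, open_library


def transform(google, open_library):
    # Merge the two sources into one flat record ready for SQL.
    volume = google['items'][0]['volumeInfo']
    return {'title': volume.get('title'),
            'pages': open_library.get('number_of_pages')}


def load(record):
    # Insert the transformed record into PostgreSQL (table and columns assumed).
    with psycopg.connect('dbname=books') as conn:
        conn.execute('INSERT INTO book (title, pages) VALUES (%s, %s)',
                     (record['title'], record['pages']))
```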
@@ -23,8 +23,9 @@ db_port = os.getenv('DB_PORT')

 today = date.today()

 def start():
     time.sleep(.25)
     logger.info(f"{datetime.now()}: Attempting connection...")

+    # Attempting to connect to the Book Database for loading book information
     db_connection_string = f'dbname={db_name} user={db_user} password={db_password} host={db_host} port={db_port}'
     with psycopg.connect(db_connection_string) as conn, \
             open(f'output/transformed_{today}.json', 'r') as transformed_books:
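A side note on the pattern in this hunk: in psycopg 3, leaving a `connect()` context block commits the transaction on success and rolls back on an exception, then closes the connection. A minimal sketch (connection string abbreviated, query is a placeholder):

```python
import psycopg

# Exiting the `with` block commits if no exception was raised,
# rolls back otherwise, then closes the connection.
with psycopg.connect('dbname=books') as conn:
    with conn.cursor() as cur:
        cur.execute('SELECT 1')   # placeholder work
        print(cur.fetchone())     # -> (1,)
```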
@@ -49,12 +50,14 @@ def load_transformed_books(cursor, books):
     `collection_item` SQL table
     '''
     for book in books['books']:
-        # This simulates a library buying multiple copies of a book.
+        # This `for i in range` statement simulates a library buying multiple copies of a book.
         try:
             for i in range(random.randrange(1, 10)):
                 cursor.execute(sql_statements.collection_insert_statement(book))
                 logger.info(f"{datetime.now()}: Book {book['title']} loaded {i+1} times.")
         except Exception as err:
+            # If one, or even a series of, books are bad, we still want to load what we can.
+            # Log the error and move on.
             logger.error(f"{err} at {book['title']}")

 if __name__ == '__main__':
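The `sql_statements.collection_insert_statement(book)` helper builds the SQL elsewhere in the repo; if it interpolates values into the string itself, a parameterized query is the safer equivalent. A hedged sketch (column names assumed, not the project's actual schema):

```python
def insert_collection_item(cursor, book):
    # Placeholders (%s) let psycopg escape values, so a title containing
    # quotes cannot break the statement or inject SQL.
    cursor.execute(
        'INSERT INTO collection_item (title, isbn) VALUES (%s, %s)',
        (book['title'], book['isbn']),
    )
```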
@@ -15,6 +15,10 @@ logger = logging.getLogger('extract.py')

 today = date.today()

+# The Orchestrator is a simple app designed to facilitate the execution of
+# the ETL pipeline. Should any system fail, the Orchestrator will log the
+# error and exit.
+
 if __name__ == '__main__':
     try:
         logger.info(f'{datetime.now()}: Starting extract.py')
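A minimal sketch of the fail-fast behavior that comment describes; the stage names and the subprocess approach are illustrative, not necessarily how the project wires its stages together:

```python
import logging
import subprocess
import sys

logger = logging.getLogger('orchestrator.py')

# Run each ETL stage in order; on the first failure, log and stop so a
# broken extract never feeds a transform or load.
STAGES = ['extract.py', 'transform.py', 'load.py']  # assumed stage scripts

if __name__ == '__main__':
    logging.basicConfig(level=logging.INFO)
    for stage in STAGES:
        logger.info('Starting %s', stage)
        try:
            subprocess.run([sys.executable, stage], check=True)
        except subprocess.CalledProcessError as err:
            logger.error('%s failed: %s', stage, err)
            sys.exit(1)
```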
@@ -39,7 +39,7 @@ def format_sort_title(title):

 def combine_raw_jsons(google_json, ol_json):
     '''
-    Returns a dictionary consisting of an array of dictionarys.
+    Returns a dictionary consisting of an array of dictionaries.
     Each child dictionary is a transformed book ready to be
     inserted into a database.

@@ -57,6 +57,7 @@ def combine_raw_jsons(google_json, ol_json):
         isbn = ol_json['book_data'][index]['isbn']
         sort_title = format_sort_title(title)

+        # Ensuring field variables have valid data
         if 'categories' in google_json['book_data'][index]['volumeInfo']:
             categories = ', '.join(google_json['book_data'][index]['volumeInfo']['categories'])
         else:
@@ -69,6 +70,7 @@ def combine_raw_jsons(google_json, ol_json):

         if 'publishedDate' in google_json['book_data'][index]['volumeInfo']:
             published_date = google_json['book_data'][index]['volumeInfo']['publishedDate']
+            # Making sure the publishing date has a month and day associated
             if len(published_date) == 4:
                 published_date += '-12-31'
             elif len(published_date) < 10:
@@ -76,8 +78,6 @@ def combine_raw_jsons(google_json, ol_json):
         else:
             published_date = '9999-12-31'

-        lost_date = '9999-12-31'
-
         if 'printType' in google_json['book_data'][index]['volumeInfo']:
             print_type = google_json['book_data'][index]['volumeInfo']['printType']
         else:
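A hedged sketch of the date-padding logic these hunks annotate. The `elif len(published_date) < 10:` body is cut off at the hunk boundary, so the padding used in that branch here is an assumption, not the project's actual value:

```python
def normalize_published_date(volume_info):
    """Pad a Google Books publishedDate to a full YYYY-MM-DD string.

    Google Books sometimes returns only 'YYYY' or 'YYYY-MM'; a SQL DATE
    column needs a complete date. The '9999-12-31' sentinel marks an
    unknown date, matching the default used in the diff.
    """
    published_date = volume_info.get('publishedDate', '9999-12-31')
    if len(published_date) == 4:     # 'YYYY'
        published_date += '-12-31'
    elif len(published_date) < 10:   # e.g. 'YYYY-MM'; exact padding is
        published_date += '-28'      # cut off in the hunk (assumed here)
    return published_date


# Example: normalize_published_date({'publishedDate': '2008'}) -> '2008-12-31'
```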
@@ -98,6 +98,7 @@ def combine_raw_jsons(google_json, ol_json):
         description = ol_json['book_data'][index]['description']
         price_in_cents = ol_json['book_data'][index]['price_in_cents']
         cover_image_uri = ol_json['book_data'][index]['cover_image_uri']
+        lost_date = '9999-12-31'

         transformed_dictionary_entry = {
             'title': title,
@@ -120,6 +121,7 @@ def combine_raw_jsons(google_json, ol_json):
             'price_in_cents': price_in_cents,
             'cover_image_uri': cover_image_uri,
         }
+
         transformed_dictionary['books'].append(transformed_dictionary_entry)

     return transformed_dictionary
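Taken together, these hunks move the `lost_date` default next to the other per-book fields. A condensed sketch of the overall `combine_raw_jsons` shape, with most fields elided and the iteration scheme assumed from the hunk context:

```python
def combine_raw_jsons(google_json, ol_json):
    '''
    Returns a dictionary consisting of an array of dictionaries.
    Each child dictionary is a transformed book ready to be
    inserted into a database.
    '''
    transformed_dictionary = {'books': []}
    for index in range(len(ol_json['book_data'])):
        volume_info = google_json['book_data'][index]['volumeInfo']
        entry = {
            'title': volume_info.get('title'),
            'isbn': ol_json['book_data'][index]['isbn'],
            'lost_date': '9999-12-31',  # sentinel: book is not lost
            # ...remaining fields elided...
        }
        transformed_dictionary['books'].append(entry)
    return transformed_dictionary
```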