diff --git a/README.md b/README.md index d07ef5a..a3f17b9 100644 --- a/README.md +++ b/README.md @@ -1,5 +1,30 @@ # LMS-DB-ETL -An Extract, Transform, Load app to gather book information from public APIs for a POC LMS project +An Extract, Transform, Load (ETL) app to gather book information from public APIs for +a Proof of Concept Library Management System project. + +(Past Git history can be found at: https://github.com/Kalarsoft/LMS-DB-ETL and +https://gitea.com/NickKalar/LMS-DB-ETL) + +## Problem +Currently, I am working on building a Library Management System (LMS) to help +develop and showcase my software engineering skills. In order to fully test +and run the LMS, I need to have a database that is populated by a variety of +different media. As I am one person, and have only about 300 books to my name, +this problem needed a better solution than manually adding in those books. + +## Solution +This project seeks to seed a database with book details, mostly pulled from +public APIs. The current version uses the Google Books API and Open Library +API. After pulling data from these APIs for several books, the data is merged +and transformed to be loaded into a PostgreSQL database for consumption by the +RESTful APIs associated with the LMS project. + +This is a rudimentary ETL pipeline, as it uses no external tools and uses only +2 Python libraries for making the API calls and connecting to the database. +However, it does showcase my understanding of Data Engineering and the ETL +cycle. + +## Setup Environmental Variables: `GOOGLE_API_KEY` - API Key required for using the Google Books API. 
diff --git a/src/load.py b/src/load.py index 504a9c2..c5ed1aa 100644 --- a/src/load.py +++ b/src/load.py @@ -23,8 +23,9 @@ db_port = os.getenv('DB_PORT') today = date.today() def start(): - time.sleep(.25) logger.info(f"{datetime.now()}: Attempting connection...") + + # Attempting to connect to the Book Database for loading book information db_connection_string = f'dbname={db_name} user={db_user} password={db_password} host={db_host} port={db_port}' with psycopg.connect(db_connection_string) as conn, \ open(f'output/transformed_{today}.json', 'r') as transformed_books: @@ -49,12 +50,14 @@ def load_transformed_books(cursor, books): `collection_item` SQL table ''' for book in books['books']: - # This simulates a library buying multiple copies of a book. + # This for i in range statement simulates a library buying multiple copies of a book. try: for i in range(random.randrange(1, 10)): cursor.execute(sql_statements.collection_insert_statement(book)) logger.info(f'{datetime.now()}:Book {book['title']} loaded {i+1} times.') except Exception as err: + # If one book, or even a series of books, is bad, we still want to load what we can. + # Log the error and move on. logger.error(f'{err} at {book.title}') if __name__ == '__main__': diff --git a/src/orchestrator.py b/src/orchestrator.py index 3246c53..0207ebe 100644 --- a/src/orchestrator.py +++ b/src/orchestrator.py @@ -15,6 +15,10 @@ logger = logging.getLogger('extract.py') today = date.today() +# The Orchestrator is a simple app designed to facilitate the execution of +# the ETL pipeline. Should any system fail, the Orchestrator will log the +# error and exit. + if __name__ == '__main__': try: logger.info(f'{datetime.now()}:Starting extract.py') diff --git a/src/transform.py b/src/transform.py index 479e108..7f625b3 100644 --- a/src/transform.py +++ b/src/transform.py @@ -39,7 +39,7 @@ def format_sort_title(title): def combine_raw_jsons(google_json, ol_json): ''' - Returns a dictionary consisting of an array of dictionarys. 
+ Returns a dictionary consisting of an array of dictionaries. Each child dictionary is a transformed book ready to be inserted into a database. @@ -57,6 +57,7 @@ def combine_raw_jsons(google_json, ol_json): isbn = ol_json['book_data'][index]['isbn'] sort_title = format_sort_title(title) + # Ensuring field variables have valid data if 'categories' in google_json['book_data'][index]['volumeInfo']: categories = ', '.join(google_json['book_data'][index]['volumeInfo']['categories']) else: @@ -69,6 +70,7 @@ def combine_raw_jsons(google_json, ol_json): if 'publishedDate' in google_json['book_data'][index]['volumeInfo']: published_date = google_json['book_data'][index]['volumeInfo']['publishedDate'] + # Making sure the publishing date has a month and day associated if len(published_date) == 4: published_date += '-12-31' elif len(published_date) < 10: @@ -76,8 +78,6 @@ def combine_raw_jsons(google_json, ol_json): else: published_date = '9999-12-31' - lost_date = '9999-12-31' - if 'printType' in google_json['book_data'][index]['volumeInfo']: print_type = google_json['book_data'][index]['volumeInfo']['printType'] else: @@ -98,6 +98,7 @@ def combine_raw_jsons(google_json, ol_json): description = ol_json['book_data'][index]['description'] price_in_cents = ol_json['book_data'][index]['price_in_cents'] cover_image_uri = ol_json['book_data'][index]['cover_image_uri'] + lost_date = '9999-12-31' transformed_dictionary_entry = { 'title': title, @@ -120,6 +121,7 @@ def combine_raw_jsons(google_json, ol_json): 'price_in_cents': price_in_cents, 'cover_image_uri': cover_image_uri, } + transformed_dictionary['books'].append(transformed_dictionary_entry) return transformed_dictionary