Added additional context
README.md | 27
@@ -1,5 +1,30 @@
 # LMS-DB-ETL

-An Extract, Transform, Load app to gather book information from public APIs for a POC LMS project
+An Extract, Transform, Load (ETL) app to gather book information from public APIs for
+a Proof of Concept Library Management System project.
+
+(Past Git history can be found at: https://github.com/Kalarsoft/LMS-DB-ETL and
+https://gitea.com/NickKalar/LMS-DB-ETL)
+
+## Problem
+Currently, I am working on building a Library Management System (LMS) to help
+develop and showcase my software engineering skills. In order to fully test
+and run the LMS, I need a database populated with a variety of different
+media. As I am one person, and have only about 300 books to my name, this
+problem needed a better solution than manually adding in those books.
+
+## Solution
+This project seeks to seed a database with book details, mostly pulled from
+public APIs. The current version uses the Google Books API and the Open
+Library API. After pulling data from these APIs for several books, the data
+is merged and transformed to be loaded into a PostgreSQL database for
+consumption by the RESTful APIs associated with the LMS project.
+
+This is a rudimentary ETL pipeline: it uses no external tools, and only two
+Python libraries for making the API calls and connecting to the database.
+However, it does showcase my understanding of Data Engineering and the ETL
+cycle.
+
 ## Setup

 Environment Variables:
 `GOOGLE_API_KEY` - API Key required for using the Google Books API.
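For context on the Solution section above, here is a minimal sketch of the extract/transform/load shape it describes. The two Python libraries it alludes to are presumably `requests` and `psycopg`; every function, table, and field name below is illustrative, not the project's actual code:

```python
import requests  # API calls
import psycopg   # PostgreSQL connection


def extract(isbn):
    # Pull raw book data for one ISBN from the two public APIs the README names.
    google = requests.get(
        'https://www.googleapis.com/books/v1/volumes',
        params={'q': f'isbn:{isbn}'}, timeout=10).json()
    open_library = requests.get(
        f'https://openlibrary.org/isbn/{isbn}.json', timeout=10).json()
    return google, open_library


def transform(google, open_library):
    # Merge the two sources into one flat record ready for SQL.
    volume = google['items'][0]['volumeInfo']
    return {'title': volume.get('title'),
            'pages': open_library.get('number_of_pages')}


def load(record):
    # Insert the transformed record into PostgreSQL (table and columns assumed).
    with psycopg.connect('dbname=books') as conn:
        conn.execute('INSERT INTO book (title, pages) VALUES (%s, %s)',
                     (record['title'], record['pages']))
```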
@@ -23,8 +23,9 @@ db_port = os.getenv('DB_PORT')

 today = date.today()

 def start():
     time.sleep(.25)
     logger.info(f"{datetime.now()}: Attempting connection...")

+    # Attempting to connect to the Book Database for loading book information
     db_connection_string = f'dbname={db_name} user={db_user} password={db_password} host={db_host} port={db_port}'
     with psycopg.connect(db_connection_string) as conn, \
             open(f'output/transformed_{today}.json', 'r') as transformed_books:
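A side note on the pattern in this hunk: in psycopg 3, leaving a `connect()` context block commits the transaction on success and rolls back on an exception, then closes the connection. A minimal sketch (connection string abbreviated, query is a placeholder):

```python
import psycopg

# Exiting the `with` block commits if no exception was raised,
# rolls back otherwise, then closes the connection.
with psycopg.connect('dbname=books') as conn:
    with conn.cursor() as cur:
        cur.execute('SELECT 1')   # placeholder work
        print(cur.fetchone())     # -> (1,)
```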
@@ -49,12 +50,14 @@ def load_transformed_books(cursor, books):
     `collection_item` SQL table
     '''
     for book in books['books']:
-        # This simulates a library buying multiple copies of a book.
+        # This `for i in range` statement simulates a library buying multiple copies of a book.
         try:
             for i in range(random.randrange(1, 10)):
                 cursor.execute(sql_statements.collection_insert_statement(book))
                 logger.info(f"{datetime.now()}: Book {book['title']} loaded {i+1} times.")
         except Exception as err:
+            # If one, or even a series of, books are bad, we still want to load what we can.
+            # Log the error and move on.
             logger.error(f"{err} at {book['title']}")

 if __name__ == '__main__':
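The `sql_statements.collection_insert_statement(book)` helper builds the SQL elsewhere in the repo; if it interpolates values into the string itself, a parameterized query is the safer equivalent. A hedged sketch (column names assumed, not the project's actual schema):

```python
def insert_collection_item(cursor, book):
    # Placeholders (%s) let psycopg escape values, so a title containing
    # quotes cannot break the statement or inject SQL.
    cursor.execute(
        'INSERT INTO collection_item (title, isbn) VALUES (%s, %s)',
        (book['title'], book['isbn']),
    )
```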
@@ -15,6 +15,10 @@ logger = logging.getLogger('extract.py')

 today = date.today()

+# The Orchestrator is a simple app designed to facilitate the execution of
+# the ETL pipeline. Should any system fail, the Orchestrator will log the
+# error and exit.
+
 if __name__ == '__main__':
     try:
         logger.info(f'{datetime.now()}: Starting extract.py')
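A minimal sketch of the fail-fast behavior that comment describes; the stage names and the subprocess approach are illustrative, not necessarily how the project wires its stages together:

```python
import logging
import subprocess
import sys

logger = logging.getLogger('orchestrator.py')

# Run each ETL stage in order; on the first failure, log and stop so a
# broken extract never feeds a transform or load.
STAGES = ['extract.py', 'transform.py', 'load.py']  # assumed stage scripts

if __name__ == '__main__':
    logging.basicConfig(level=logging.INFO)
    for stage in STAGES:
        logger.info('Starting %s', stage)
        try:
            subprocess.run([sys.executable, stage], check=True)
        except subprocess.CalledProcessError as err:
            logger.error('%s failed: %s', stage, err)
            sys.exit(1)
```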
@@ -39,7 +39,7 @@ def format_sort_title(title):

 def combine_raw_jsons(google_json, ol_json):
     '''
-    Returns a dictionary consisting of an array of dictionarys.
+    Returns a dictionary consisting of an array of dictionaries.
     Each child dictionary is a transformed book ready to be
     inserted into a database.

@@ -57,6 +57,7 @@ def combine_raw_jsons(google_json, ol_json):
         isbn = ol_json['book_data'][index]['isbn']
         sort_title = format_sort_title(title)

+        # Ensuring field variables have valid data
         if 'categories' in google_json['book_data'][index]['volumeInfo']:
             categories = ', '.join(google_json['book_data'][index]['volumeInfo']['categories'])
         else:
@@ -69,6 +70,7 @@ def combine_raw_jsons(google_json, ol_json):

         if 'publishedDate' in google_json['book_data'][index]['volumeInfo']:
             published_date = google_json['book_data'][index]['volumeInfo']['publishedDate']
+            # Making sure the publishing date has a month and day associated
             if len(published_date) == 4:
                 published_date += '-12-31'
             elif len(published_date) < 10:
@@ -76,8 +78,6 @@ def combine_raw_jsons(google_json, ol_json):
         else:
             published_date = '9999-12-31'

-        lost_date = '9999-12-31'
-
         if 'printType' in google_json['book_data'][index]['volumeInfo']:
             print_type = google_json['book_data'][index]['volumeInfo']['printType']
         else:
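A hedged sketch of the date-padding logic these hunks annotate. The `elif len(published_date) < 10:` body is cut off at the hunk boundary, so the padding used in that branch here is an assumption, not the project's actual value:

```python
def normalize_published_date(volume_info):
    """Pad a Google Books publishedDate to a full YYYY-MM-DD string.

    Google Books sometimes returns only 'YYYY' or 'YYYY-MM'; a SQL DATE
    column needs a complete date. The '9999-12-31' sentinel marks an
    unknown date, matching the default used in the diff.
    """
    published_date = volume_info.get('publishedDate', '9999-12-31')
    if len(published_date) == 4:     # 'YYYY'
        published_date += '-12-31'
    elif len(published_date) < 10:   # e.g. 'YYYY-MM'; exact padding is
        published_date += '-28'      # cut off in the hunk (assumed here)
    return published_date


# Example: normalize_published_date({'publishedDate': '2008'}) -> '2008-12-31'
```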
@@ -98,6 +98,7 @@ def combine_raw_jsons(google_json, ol_json):
         description = ol_json['book_data'][index]['description']
         price_in_cents = ol_json['book_data'][index]['price_in_cents']
         cover_image_uri = ol_json['book_data'][index]['cover_image_uri']
+        lost_date = '9999-12-31'

         transformed_dictionary_entry = {
             'title': title,
@@ -120,6 +121,7 @@ def combine_raw_jsons(google_json, ol_json):
             'price_in_cents': price_in_cents,
             'cover_image_uri': cover_image_uri,
         }
+
         transformed_dictionary['books'].append(transformed_dictionary_entry)

     return transformed_dictionary
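Taken together, these hunks move the `lost_date` default next to the other per-book fields. A condensed sketch of the overall `combine_raw_jsons` shape, with most fields elided and the iteration scheme assumed from the hunk context:

```python
def combine_raw_jsons(google_json, ol_json):
    '''
    Returns a dictionary consisting of an array of dictionaries.
    Each child dictionary is a transformed book ready to be
    inserted into a database.
    '''
    transformed_dictionary = {'books': []}
    for index in range(len(ol_json['book_data'])):
        volume_info = google_json['book_data'][index]['volumeInfo']
        entry = {
            'title': volume_info.get('title'),
            'isbn': ol_json['book_data'][index]['isbn'],
            'lost_date': '9999-12-31',  # sentinel: book is not lost
            # ...remaining fields elided...
        }
        transformed_dictionary['books'].append(entry)
    return transformed_dictionary
```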