Added additional context
This commit is contained in:
27
README.md
27
README.md
@@ -1,5 +1,30 @@
|
|||||||
# LMS-DB-ETL
|
# LMS-DB-ETL
|
||||||
An Extract, Transform, Load app to gather book information from public APIs for a POC LMS project
|
An Extract, Transform, Load (ETL) app to gather book information from public APIs for
|
||||||
|
a Proof of Concept Library Management System project.
|
||||||
|
|
||||||
|
(Past Git history can be found at: https://github.com/Kalarsoft/LMS-DB-ETL and
|
||||||
|
https://gitea.com/NickKalar/LMS-DB-ETL)
|
||||||
|
|
||||||
|
## Problem
|
||||||
|
Currently, I am working on building a Library Management System (LMS) to help
|
||||||
|
develop and showcase my software engineering skills. In order to fully test
|
||||||
|
and run the LMS, I need to have a database that is populated by a variety of
|
||||||
|
different media. As I am one person, and have only about 300 books to my name,
|
||||||
|
this problem needed a better solution than manually adding in those books.
|
||||||
|
|
||||||
|
## Solution
|
||||||
|
This project seeks to seed a database with book details, mostly pulled from
|
||||||
|
public APIs. The current version uses the Google Books API and Open Library
|
||||||
|
API. After pulling data from these APIs for several books, the data is merged
|
||||||
|
and transformed to be loaded into a PostgreSQL database for consumption by the
|
||||||
|
RESTful APIs associated with the LMS project.
|
||||||
|
|
||||||
|
This is a rudimentary ETL pipeline, as it uses no external tools and uses only
|
||||||
|
2 Python libraries for making the API calls and connecting to the database.
|
||||||
|
However, it does showcase my understanding of Data Engineering and the ETL
|
||||||
|
cycle.
|
||||||
|
|
||||||
|
## Setup
|
||||||
|
|
||||||
Environmental Variables:
|
Environmental Variables:
|
||||||
`GOOGLE_API_KEY` - API Key required for using the Google Books API.
|
`GOOGLE_API_KEY` - API Key required for using the Google Books API.
|
||||||
|
|||||||
@@ -23,8 +23,9 @@ db_port = os.getenv('DB_PORT')
|
|||||||
today = date.today()
|
today = date.today()
|
||||||
|
|
||||||
def start():
|
def start():
|
||||||
time.sleep(.25)
|
|
||||||
logger.info(f"{datetime.now()}: Attempting connection...")
|
logger.info(f"{datetime.now()}: Attempting connection...")
|
||||||
|
|
||||||
|
# Attempting to connect to the Book Database for loading book information
|
||||||
db_connection_string = f'dbname={db_name} user={db_user} password={db_password} host={db_host} port={db_port}'
|
db_connection_string = f'dbname={db_name} user={db_user} password={db_password} host={db_host} port={db_port}'
|
||||||
with psycopg.connect(db_connection_string) as conn, \
|
with psycopg.connect(db_connection_string) as conn, \
|
||||||
open(f'output/transformed_{today}.json', 'r') as transformed_books:
|
open(f'output/transformed_{today}.json', 'r') as transformed_books:
|
||||||
@@ -49,12 +50,14 @@ def load_transformed_books(cursor, books):
|
|||||||
`collection_item` SQL table
|
`collection_item` SQL table
|
||||||
'''
|
'''
|
||||||
for book in books['books']:
|
for book in books['books']:
|
||||||
# This simulates a library buying multiple copies of a book.
|
# This for i in range statement simulates a library buying multiple copies of a book.
|
||||||
try:
|
try:
|
||||||
for i in range(random.randrange(1, 10)):
|
for i in range(random.randrange(1, 10)):
|
||||||
cursor.execute(sql_statements.collection_insert_statement(book))
|
cursor.execute(sql_statements.collection_insert_statement(book))
|
||||||
logger.info(f'{datetime.now()}:Book {book['title']} loaded {i+1} times.')
|
logger.info(f'{datetime.now()}:Book {book['title']} loaded {i+1} times.')
|
||||||
except Exception as err:
|
except Exception as err:
|
||||||
|
# if one, or even a series of books are bad, we still want to load what we can.
|
||||||
|
# Log the error and move on.
|
||||||
logger.error(f'{err} at {book.title}')
|
logger.error(f'{err} at {book.title}')
|
||||||
|
|
||||||
if __name__ == '__main__':
|
if __name__ == '__main__':
|
||||||
|
|||||||
@@ -15,6 +15,10 @@ logger = logging.getLogger('extract.py')
|
|||||||
|
|
||||||
today = date.today()
|
today = date.today()
|
||||||
|
|
||||||
|
# The Orchestrator is a simple app designed to facilitate the execution of
|
||||||
|
# the ETL pipeline. Should any system fail, the Orchestrator will log the
|
||||||
|
# error and exit.
|
||||||
|
|
||||||
if __name__ == '__main__':
|
if __name__ == '__main__':
|
||||||
try:
|
try:
|
||||||
logger.info(f'{datetime.now()}:Starting extract.py')
|
logger.info(f'{datetime.now()}:Starting extract.py')
|
||||||
|
|||||||
@@ -39,7 +39,7 @@ def format_sort_title(title):
|
|||||||
|
|
||||||
def combine_raw_jsons(google_json, ol_json):
|
def combine_raw_jsons(google_json, ol_json):
|
||||||
'''
|
'''
|
||||||
Returns a dictionary consisting of an array of dictionarys.
|
Returns a dictionary consisting of an array of dictionaries.
|
||||||
Each child dictionary is a transformed book ready to be
|
Each child dictionary is a transformed book ready to be
|
||||||
inserted into a database.
|
inserted into a database.
|
||||||
|
|
||||||
@@ -57,6 +57,7 @@ def combine_raw_jsons(google_json, ol_json):
|
|||||||
isbn = ol_json['book_data'][index]['isbn']
|
isbn = ol_json['book_data'][index]['isbn']
|
||||||
sort_title = format_sort_title(title)
|
sort_title = format_sort_title(title)
|
||||||
|
|
||||||
|
# Ensuring field variables have valid data
|
||||||
if 'categories' in google_json['book_data'][index]['volumeInfo']:
|
if 'categories' in google_json['book_data'][index]['volumeInfo']:
|
||||||
categories = ', '.join(google_json['book_data'][index]['volumeInfo']['categories'])
|
categories = ', '.join(google_json['book_data'][index]['volumeInfo']['categories'])
|
||||||
else:
|
else:
|
||||||
@@ -69,6 +70,7 @@ def combine_raw_jsons(google_json, ol_json):
|
|||||||
|
|
||||||
if 'publishedDate' in google_json['book_data'][index]['volumeInfo']:
|
if 'publishedDate' in google_json['book_data'][index]['volumeInfo']:
|
||||||
published_date = google_json['book_data'][index]['volumeInfo']['publishedDate']
|
published_date = google_json['book_data'][index]['volumeInfo']['publishedDate']
|
||||||
|
# Making sure the publishing date has a month and day associated
|
||||||
if len(published_date) == 4:
|
if len(published_date) == 4:
|
||||||
published_date += '-12-31'
|
published_date += '-12-31'
|
||||||
elif len(published_date) < 10:
|
elif len(published_date) < 10:
|
||||||
@@ -76,8 +78,6 @@ def combine_raw_jsons(google_json, ol_json):
|
|||||||
else:
|
else:
|
||||||
published_date = '9999-12-31'
|
published_date = '9999-12-31'
|
||||||
|
|
||||||
lost_date = '9999-12-31'
|
|
||||||
|
|
||||||
if 'printType' in google_json['book_data'][index]['volumeInfo']:
|
if 'printType' in google_json['book_data'][index]['volumeInfo']:
|
||||||
print_type = google_json['book_data'][index]['volumeInfo']['printType']
|
print_type = google_json['book_data'][index]['volumeInfo']['printType']
|
||||||
else:
|
else:
|
||||||
@@ -98,6 +98,7 @@ def combine_raw_jsons(google_json, ol_json):
|
|||||||
description = ol_json['book_data'][index]['description']
|
description = ol_json['book_data'][index]['description']
|
||||||
price_in_cents = ol_json['book_data'][index]['price_in_cents']
|
price_in_cents = ol_json['book_data'][index]['price_in_cents']
|
||||||
cover_image_uri = ol_json['book_data'][index]['cover_image_uri']
|
cover_image_uri = ol_json['book_data'][index]['cover_image_uri']
|
||||||
|
lost_date = '9999-12-31'
|
||||||
|
|
||||||
transformed_dictionary_entry = {
|
transformed_dictionary_entry = {
|
||||||
'title': title,
|
'title': title,
|
||||||
@@ -120,6 +121,7 @@ def combine_raw_jsons(google_json, ol_json):
|
|||||||
'price_in_cents': price_in_cents,
|
'price_in_cents': price_in_cents,
|
||||||
'cover_image_uri': cover_image_uri,
|
'cover_image_uri': cover_image_uri,
|
||||||
}
|
}
|
||||||
|
|
||||||
transformed_dictionary['books'].append(transformed_dictionary_entry)
|
transformed_dictionary['books'].append(transformed_dictionary_entry)
|
||||||
|
|
||||||
return transformed_dictionary
|
return transformed_dictionary
|
||||||
|
|||||||
Reference in New Issue
Block a user