Refactoring
This commit is contained in:
112
src/extract.py
112
src/extract.py
@@ -2,108 +2,90 @@ import os
|
|||||||
from dotenv import load_dotenv
|
from dotenv import load_dotenv
|
||||||
import requests
|
import requests
|
||||||
import json
|
import json
|
||||||
from datetime import date
|
from datetime import date, datetime
|
||||||
import time
|
import time
|
||||||
|
import logging
|
||||||
|
|
||||||
load_dotenv
|
load_dotenv
|
||||||
|
|
||||||
google_api_key = os.getenv('GOOGLE_API_KEY')
|
google_api_key = os.getenv('GOOGLE_API_KEY')
|
||||||
|
google_header = {'key': google_api_key}
|
||||||
|
open_lib_header = {'User-Agent': 'Kalar-LMS nick@kalar.codes'}
|
||||||
|
|
||||||
today = date.today()
|
today = date.today()
|
||||||
|
|
||||||
def extract_book_json(url, header):
|
logger = logging.getLogger('extract.py')
|
||||||
|
logging.basicConfig(filename='lms-etl.log', level=logging.DEBUG)
|
||||||
|
|
||||||
|
def extract_book_json(url, header=[]):
|
||||||
|
'''
|
||||||
|
Returns a dictionary (JSON) of books or an empty dictionary on error.
|
||||||
|
|
||||||
|
Keyword arguments:
|
||||||
|
url -- the url used to make the request.
|
||||||
|
header -- the optional headers passed to specify things needed for the queries, like API keys.
|
||||||
|
'''
|
||||||
try:
|
try:
|
||||||
response = requests.get(url, headers=header)
|
response = requests.get(url, headers=header)
|
||||||
response.raise_for_status()
|
response.raise_for_status()
|
||||||
except requests.exceptions.HTTPError as err:
|
except requests.exceptions.HTTPError as err:
|
||||||
raise SystemError(err)
|
logger.error(f'An error occurred: {err}')
|
||||||
|
return {}
|
||||||
return response.json()
|
return response.json()
|
||||||
|
|
||||||
class GoogleBooks():
|
def get_google_book_data(query, offset=0):
|
||||||
header = {'key': google_api_key}
|
'''
|
||||||
fields = "items(volumeInfo/title,volumeInfo/authors,volumeInfo/publishedDate," \
|
Returns a dictionary of books from the Google Books API based on a query.
|
||||||
"volumeInfo/publisher,volumeInfo/categories,volumeInfo/pageCount,volumeInfo/printType)"
|
|
||||||
|
|
||||||
def fetch_book_data_by_author(self, author, offset=0):
|
Keyword arguments:
|
||||||
author = author.replace(' ', '+')
|
query --
|
||||||
url = (f'https://www.googleapis.com/books/v1/volumes?q=inauthor:{author}'
|
offset -- the optional page offset for a query. Google Books API limits the number
|
||||||
f'&fields={self.fields}&startIndex={offset}')
|
of responses per query and returns an ordered list. This allows you to skip
|
||||||
|
the first x number of responses.
|
||||||
|
'''
|
||||||
|
query = query.replace(' ', '+')
|
||||||
|
fields = ("items(volumeInfo/title,volumeInfo/authors,volumeInfo/publishedDate,"
|
||||||
|
"volumeInfo/publisher,volumeInfo/categories,volumeInfo/pageCount,volumeInfo/printType)")
|
||||||
|
url = (f'https://www.googleapis.com/books/v1/volumes?q={query}'
|
||||||
|
f'&fields={fields}&startIndex={offset}')
|
||||||
|
return extract_book_json(url, google_header)
|
||||||
|
|
||||||
return extract_book_json(url, self.header)
|
def get_open_library_book_data(query, offset=0):
|
||||||
|
'''
|
||||||
|
|
||||||
|
'''
|
||||||
def fetch_book_data_by_title(self, title, offset=0):
|
query = query.replace(' ', '+')
|
||||||
title = title.replace(' ', '+')
|
|
||||||
url = (f'https://www.googleapis.com/books/v1/volumes?q=intitle:{title}'
|
|
||||||
f'&fields={self.fields}&startIndex={offset}')
|
|
||||||
|
|
||||||
return extract_book_json(url, self.header)
|
|
||||||
|
|
||||||
|
|
||||||
def fetch_book_data_by_genre(self, genre, offset=0):
|
|
||||||
genre = genre.replace(' ', '+')
|
|
||||||
url = (f'https://www.googleapis.com/books/v1/volumes?q=subject:{genre}'
|
|
||||||
f'&fields={self.fields}&startIndex={offset}')
|
|
||||||
|
|
||||||
return extract_book_json(url, self.header)
|
|
||||||
|
|
||||||
def fetch_book_data_by_query(self, query, offset=0):
|
|
||||||
url = (f'https://www.googleapis.com/books/v1/volumes?q={query}'
|
|
||||||
f'&fields={self.fields}&startIndex={offset}')
|
|
||||||
|
|
||||||
return extract_book_json(url, self.header)
|
|
||||||
|
|
||||||
|
|
||||||
class OpenLibrary():
|
|
||||||
header = {'User-Agent': 'Kalar-LMS nick@kalar.codes'}
|
|
||||||
fields = 'author_name,title,isbn'
|
fields = 'author_name,title,isbn'
|
||||||
|
url = f'https://openlibrary.org/search.json?{query}&lang=en&fields={fields}'
|
||||||
|
|
||||||
def fetch_book_data_by_author(self, author):
|
return extract_book_json(url, open_lib_header)
|
||||||
author = author.replace(' ', '+')
|
|
||||||
url = f'https://openlibrary.org/search.json?author={author}&lang=en&fields={self.fields}'
|
|
||||||
|
|
||||||
return extract_book_json(url, self.header)
|
|
||||||
|
|
||||||
def fetch_book_data_by_title(self, title):
|
|
||||||
title = title.replace(' ', '+')
|
|
||||||
url = f'https://openlibrary.org/search.json?title={title}&lang=en&fields={self.fields}'
|
|
||||||
|
|
||||||
return extract_book_json(url, self.header)
|
|
||||||
|
|
||||||
def fetch_book_data_by_genre(self, genre):
|
|
||||||
genre = genre.replace(' ', '+')
|
|
||||||
url = f'https://openlibrary.org/search.json?subject={genre}&lang=en&fields={self.fields}'
|
|
||||||
|
|
||||||
return extract_book_json(url, self.header)
|
|
||||||
|
|
||||||
|
|
||||||
def start():
|
def start():
|
||||||
titles = []
|
titles = []
|
||||||
|
google_books_array = []
|
||||||
|
open_lib_array = []
|
||||||
with open('config/title.txt', 'r') as google_books_file:
|
with open('config/title.txt', 'r') as google_books_file:
|
||||||
for line in google_books_file:
|
for line in google_books_file:
|
||||||
titles.append(line.strip())
|
titles.append(line.strip())
|
||||||
|
|
||||||
google_books = GoogleBooks()
|
|
||||||
open_lib = OpenLibrary()
|
|
||||||
|
|
||||||
google_books_array = []
|
|
||||||
open_lib_array = []
|
|
||||||
|
|
||||||
with open(f'output/raw_google_books_{today}.json', 'w') as google_books_file, \
|
with open(f'output/raw_google_books_{today}.json', 'w') as google_books_file, \
|
||||||
open(f'output/raw_open_lib_books_{today}.json', 'w') as open_lib_file:
|
open(f'output/raw_open_lib_books_{today}.json', 'w') as open_lib_file:
|
||||||
google_books_file.write('{"book_data":')
|
google_books_file.write('{"book_data":')
|
||||||
open_lib_file.write('{"book_data":')
|
open_lib_file.write('{"book_data":')
|
||||||
|
|
||||||
for title in titles:
|
for title in titles:
|
||||||
open_lib_books = open_lib.fetch_book_data_by_title(title)
|
open_lib_query = f'title={title}'
|
||||||
|
open_lib_books = get_open_library_book_data(open_lib_query)
|
||||||
for books in open_lib_books['docs']:
|
for books in open_lib_books['docs']:
|
||||||
print(str(books))
|
logger.debug(f'{datetime.now()}:Book found: {str(books)}')
|
||||||
if 'author_name' in books \
|
if 'author_name' in books \
|
||||||
and 'title' in books \
|
and 'title' in books \
|
||||||
and 'isbn' in books:
|
and 'isbn' in books:
|
||||||
for isbn in books['isbn']:
|
for isbn in books['isbn']:
|
||||||
if len(isbn) == 13:
|
if len(isbn) == 13:
|
||||||
query = 'isbn:' + isbn
|
google_query = 'isbn:' + isbn
|
||||||
google_book_info = google_books.fetch_book_data_by_query(query)
|
google_book_info = get_google_book_data(google_query)
|
||||||
|
|
||||||
if google_book_info != {}:
|
if google_book_info != {}:
|
||||||
potential_ol_book = books
|
potential_ol_book = books
|
||||||
@@ -119,4 +101,6 @@ def start():
|
|||||||
|
|
||||||
|
|
||||||
if __name__ == '__main__':
|
if __name__ == '__main__':
|
||||||
|
print('Starting Extraction.')
|
||||||
start()
|
start()
|
||||||
|
print('Extraction done.')
|
||||||
|
|||||||
Reference in New Issue
Block a user