Organized extract.py

This commit is contained in:
2025-06-16 22:37:39 -04:00
parent 3d7d124883
commit 36fb1a05f1
2 changed files with 52 additions and 59 deletions

View File

@@ -1,3 +1,5 @@
Everything is Tuberculosis Everything is Tuberculosis
The Fellowship of the Ring The Fellowship of the Ring
Pale Blue Dot Pale Blue Dot
The New Jim Crow
Leviathan Wakes

View File

@@ -1,13 +1,16 @@
import os import os
from dotenv import load_dotenv
import requests import requests
import json import json
from datetime import date from datetime import date
import time import time
# Actually call load_dotenv: a bare name reference is a no-op, so the
# environment would never be populated before os.getenv reads GOOGLE_API_KEY.
load_dotenv()
google_api_key = os.getenv('GOOGLE_API_KEY') google_api_key = os.getenv('GOOGLE_API_KEY')
today = date.today() today = date.today()
def extract_book_data(url, header): def extract_book_json(url, header):
try: try:
response = requests.get(url, headers=header) response = requests.get(url, headers=header)
response.raise_for_status() response.raise_for_status()
@@ -25,7 +28,7 @@ class GoogleBooks():
url = (f'https://www.googleapis.com/books/v1/volumes?q=inauthor:{author}' url = (f'https://www.googleapis.com/books/v1/volumes?q=inauthor:{author}'
f'&fields={self.fields}&startIndex={offset}') f'&fields={self.fields}&startIndex={offset}')
return extract_book_data(url, self.header) return extract_book_json(url, self.header)
def fetch_book_data_by_title(self, title, offset=0): def fetch_book_data_by_title(self, title, offset=0):
@@ -33,7 +36,7 @@ class GoogleBooks():
url = (f'https://www.googleapis.com/books/v1/volumes?q=intitle:{title}' url = (f'https://www.googleapis.com/books/v1/volumes?q=intitle:{title}'
f'&fields={self.fields}&startIndex={offset}') f'&fields={self.fields}&startIndex={offset}')
return extract_book_data(url, self.header) return extract_book_json(url, self.header)
def fetch_book_data_by_genre(self, genre, offset=0): def fetch_book_data_by_genre(self, genre, offset=0):
@@ -41,13 +44,13 @@ class GoogleBooks():
url = (f'https://www.googleapis.com/books/v1/volumes?q=subject:{genre}' url = (f'https://www.googleapis.com/books/v1/volumes?q=subject:{genre}'
f'&fields={self.fields}&startIndex={offset}') f'&fields={self.fields}&startIndex={offset}')
return extract_book_data(url, self.header) return extract_book_json(url, self.header)
def fetch_book_data_by_query(self, query, offset=0): def fetch_book_data_by_query(self, query, offset=0):
url = (f'https://www.googleapis.com/books/v1/volumes?q={query}' url = (f'https://www.googleapis.com/books/v1/volumes?q={query}'
f'&fields={self.fields}&startIndex={offset}') f'&fields={self.fields}&startIndex={offset}')
return extract_book_data(url, self.header) return extract_book_json(url, self.header)
class OpenLibrary(): class OpenLibrary():
@@ -58,74 +61,62 @@ class OpenLibrary():
author = author.replace(' ', '+') author = author.replace(' ', '+')
url = f'https://openlibrary.org/search.json?author={author}&lang=en&fields={self.fields}' url = f'https://openlibrary.org/search.json?author={author}&lang=en&fields={self.fields}'
return extract_book_data(url, self.header) return extract_book_json(url, self.header)
def fetch_book_data_by_title(self, title): def fetch_book_data_by_title(self, title):
title = title.replace(' ', '+') title = title.replace(' ', '+')
url = f'https://openlibrary.org/search.json?title={title}&lang=en&fields={self.fields}' url = f'https://openlibrary.org/search.json?title={title}&lang=en&fields={self.fields}'
return extract_book_data(url, self.header) return extract_book_json(url, self.header)
def fetch_book_data_by_genre(self, genre): def fetch_book_data_by_genre(self, genre):
genre = genre.replace(' ', '+') genre = genre.replace(' ', '+')
url = f'https://openlibrary.org/search.json?subject={genre}&lang=en&fields={self.fields}' url = f'https://openlibrary.org/search.json?subject={genre}&lang=en&fields={self.fields}'
return extract_book_data(url, self.header) return extract_book_json(url, self.header)
def write_open_lib_json(open_lib): def start():
open_lib_json = json.dumps(open_lib.fetch_book_data_by_title('Pale Blue Dot'), indent=4)
with open(f'output/open_lib_{today}.json', 'w') as f:
f.write(open_lib_json)
def write_google_books_json(google_books, query):
google_books_json = json.dumps(google_books.fetch_book_data_by_query(query), indent=4)
with open(f'output/google_books_{today}.json', 'a') as f:
if google_books_json != None:
f.write(google_books_json)
def get_google_books_info(google_books, query):
return google_books.fetch_book_data_by_query(query)
if __name__ == '__main__':
titles = [] titles = []
with open('config/title.txt', 'r') as f: with open('config/title.txt', 'r') as google_books_file:
for line in f: for line in google_books_file:
titles.append(line.strip()) titles.append(line.strip())
google_books = GoogleBooks() google_books = GoogleBooks()
open_lib = OpenLibrary() open_lib = OpenLibrary()
google_books_json = {'items':[]} google_books_array = []
open_lib_json = {'items':[]} open_lib_array = []
with open(f'output/raw_google_books_{today}.json', 'w') as google_books_file, \
open(f'output/raw_open_lib_books_{today}.json', 'w') as open_lib_file:
google_books_file.write('{"book_data":')
open_lib_file.write('{"book_data":')
for title in titles: for title in titles:
open_lib_books = open_lib.fetch_book_data_by_title(title) open_lib_books = open_lib.fetch_book_data_by_title(title)
for books in open_lib_books['docs']: for books in open_lib_books['docs']:
potential_book = { print(str(books))
'author': books['author_name'], if 'author_name' in books \
'title': books['title'], and 'title' in books \
} and 'isbn' in books:
for isbn in books['isbn']: for isbn in books['isbn']:
if len(isbn) == 13: if len(isbn) == 13:
query = 'isbn:' + isbn query = 'isbn:' + isbn
book_info = get_google_books_info(google_books, query) google_book_info = google_books.fetch_book_data_by_query(query)
if book_info != {}:
potential_book['isbn'] = isbn if google_book_info != {}:
potential_ol_book = books
potential_ol_book['isbn'] = isbn
open_lib_json['items'].append(potential_book) open_lib_array.append(potential_ol_book)
google_books_json['items'].append(book_info['items'][0]) google_books_array.append(google_book_info['items'][0])
time.sleep(.5)
time.sleep(.5)
with open(f'output/google_books_{today}.json', 'a') as f: google_books_file.write(json.dumps(google_books_array)+'}')
f.write(json.dumps(google_books_json)+',') open_lib_file.write(json.dumps(open_lib_array)+'}')
with open(f'output/open_lib_books_{today}.json', 'a') as f:
f.write(json.dumps(open_lib_json)+',')
print('Title Done') if __name__ == '__main__':
time.sleep(5) start()
print('Starting Next')