Example: Multiple Tables

In this example, I show how doctable can be used with multiple relational tables to perform queries which automatically merge different aspects of your dataset when you use .select(). By integrating these relations into the schema, your database can automatically maintain consistency between tables by deleting irrelevant elements when their relations disappear. There are two important features of any multi-table schema using doctable:

(1) Set the foreign_keys=True in the DocTable or ConnectEngine constructor. It is enabled by default. Otherwise sqlalchemy will not enable.

(2) Use the "foreignkey" column type to set the constraint, probably with the onupdate and ondelete keywords specifiied.

I will show two examples here: many-to-many relations, and many-to-one relations.

import datetime
import dataclasses
import tempfile
import sys
sys.path.append('..')
import doctable
tmp = tempfile.TemporaryDirectory()

Many-to-Many Relationships

The premise is that we have an imaginary API where we can get newly released books along with the libraries they are associted with (although they man, in some cases, not have library information). We want to keep track of the set of books with unique titles, and have book information exist on its own (i.e. we can insert book information if it does not have library information). We would also like to keep track of the libraries they belong to. We need this schema to be fast for selection, but it can be slow for insertion.

Primary accesses methods:

insert a book
query books by year of publication
insert a single library and associated books
query books associated with libraries in certain zips

In this example, we are going to use two tables with a many-to-many relationships and a table to handle relationships between them (required for a many-to-many relationship):

BookTable: keeps title and publication year of each book. Should exist independently of LibraryTable, because we may not want to use LibraryTable at all.
LibraryTable: keeps name of library, makes it easy to query by Library.
BookLibraryRelationsTable: keeps track of relationships between BookTable and LibraryTable.

First we define the BookTable table. Because we are primarily interested in books, we will create a separate Book object for working with them.

@doctable.schema(frozen=True, eq=True)
class Book:
    __slots__ = []
    _id: int = doctable.IDCol()
    isbn: str = doctable.Col(unique=True)
    title: str = doctable.Col()
    year: int = doctable.Col()
    author: str = doctable.Col()
    date_updated: datetime.datetime = doctable.UpdatedCol()

class BookTable(doctable.DocTable):
    _tabname_ = 'books'
    _schema_ = Book
    _indices_ = [doctable.Index('isbn_index', 'isbn')]

book_table = BookTable(target=f'{tmp.name}/1.db', new_db=True)

We are not planning to work with author data outside of the schema definition, so we include it as part of the table definition.

@doctable.schema(frozen=True, eq=True)
class Library:
    __slots__ = []
    _id: int = doctable.IDCol()
    name: str = doctable.Col()
    zip: int = doctable.Col()

class LibraryTable(doctable.DocTable):
    _tabname_ = 'libraries'
    _schema_ = Library    
    _constraints_ = [doctable.Constraint('unique', 'name', 'zip')]


library_table = LibraryTable(engine=book_table.engine)

class BookLibraryRelationsTable(doctable.DocTable):
    '''Link between books and libraries.'''
    _tabname_ = 'book_library_relations'

    @doctable.schema
    class _schema_:
        __slots__ = []
        _id: int = doctable.IDCol()
        book_isbn: int = doctable.Col(nullable=False)
        library_id: int = doctable.Col(nullable=False)

    _constraints_ = (
        doctable.Constraint('foreignkey', ('book_isbn',), ('books.isbn',)),
        doctable.Constraint('foreignkey', ('library_id',), ('libraries._id',)),
        doctable.Constraint('unique', 'book_isbn', 'library_id'),
    )

relations_table = BookLibraryRelationsTable(engine=book_table.engine)
relations_table.list_tables()

['book_library_relations', 'books', 'libraries']

Now we create some random books that are not at libraries and add them into our database.

newly_published_books = [
    Book(isbn='A', title='A', year=2020, author='Pierre Bourdieu'),
    Book(isbn='E', title='E', year=2018, author='Jean-Luc Picard'),
]

for book in newly_published_books:
    print(book)

Book(isbn='A', title='A', year=2020, author='Pierre Bourdieu')
Book(isbn='E', title='E', year=2018, author='Jean-Luc Picard')

Now we insert the list of books that were published. It works as expected.

book_table.insert(newly_published_books, ifnotunique='replace')
book_table.head()

	_id	isbn	title	year	author	date_updated
0	1	A	A	2020	Pierre Bourdieu	2022-07-26 21:30:30.364805
1	2	E	E	2018	Jean-Luc Picard	2022-07-26 21:30:30.364812

And now lets add a bunch of books that are associated with library objects.

new_library_books = {
    Library(name='Library1', zip=12345): [
        Book(isbn='A', title='A', year=2020, author='Pierre Bourdieu'),
        Book(isbn='B', title='B', year=2020, author='Pierre Bourdieu'),
    ],
    Library(name='Library2', zip=12345): [
        Book(isbn='A', title='A', year=2020, author='Devin Cornell'),
        Book(isbn='C', title='C', year=2021, author='Jean-Luc Picard'),
    ],
    Library(name='Library3', zip=67890): [
        Book(isbn='A', title='A', year=2020, author='Pierre Bourdieu'),
        Book(isbn='B', title='B', year=2020, author='Jean-Luc Picard'),
        Book(isbn='D', title='D', year=2019, author='Devin Cornell'),
    ],
}

for library, books in new_library_books.items():
    r = library_table.insert(library, ifnotunique='ignore')
    book_table.insert(books, ifnotunique='replace')
    relations_table.insert([{'book_isbn':b.isbn, 'library_id': r.lastrowid} for b in books], ifnotunique='ignore')

book_table.select_df()

	_id	isbn	title	year	author	date_updated
0	2	E	E	2018	Jean-Luc Picard	2022-07-26 21:30:30.364812
1	6	C	C	2021	Jean-Luc Picard	2022-07-26 21:30:30.482867
2	7	A	A	2020	Pierre Bourdieu	2022-07-26 21:30:30.494686
3	8	B	B	2020	Jean-Luc Picard	2022-07-26 21:30:30.494692
4	9	D	D	2019	Devin Cornell	2022-07-26 21:30:30.494694

library_table.select_df()

	_id	name	zip
0	1	Library1	12345
1	2	Library2	12345
2	3	Library3	67890

relations_table.select_df()

	_id	book_isbn	library_id
0	1	A	1
1	2	B	1
2	3	A	2
3	4	C	2
4	5	A	3
5	6	B	3
6	7	D	3

Select Queries That Join Tables

Similar to sqlalchemy, DocTable joins are doen simply by replacing the where conditional. While not technically nessecary, typically you will be joining tables on foreign key columns because it is much faster.

bt, lt, rt = book_table, library_table, relations_table

For the first example, say we want to get the isbn numbers of books associated with each library in zip code 12345. We implement the join using a simple conditional equating the associated keys in each table. Our database schema already knows that the foreign keys are in place, so this expression will give us the join we want.

lt.select([lt['name'], rt['book_isbn']], where=(lt['_id']==rt['library_id']) & (lt['zip']==12345), as_dataclass=False)

[('Library1', 'A'), ('Library1', 'B'), ('Library2', 'A'), ('Library2', 'C')]

Now say we want to characterize each library according to the age distribution of it's books. We use two conditionals for the join: one connecting library table to relations table, and another connecting relations table to books table. We also include the condition to get only libraries associated with the given zip.

conditions = (bt['isbn']==rt['book_isbn']) & (rt['library_id']==lt['_id']) & (lt['zip']==12345)
bt.select([bt['title'], bt['year'], lt['name']], where=conditions, as_dataclass=False)

[('C', 2021, 'Library2'),
 ('A', 2020, 'Library1'),
 ('A', 2020, 'Library2'),
 ('B', 2020, 'Library1')]

Alternatively we can use the .join method of doctable (although I recommend just using select statements).

jt = lt.join(rt, (lt['zip']==12345) & (lt['_id']==rt['library_id']), isouter=False)
bt.select(where=bt['isbn']==jt.c['book_library_relations_book_isbn'], as_dataclass=True)
bt.select([bt['title'], jt.c['book_library_relations_library_id']], where=bt['isbn']==jt.c['book_library_relations_book_isbn'], as_dataclass=False)
bt.select([bt['title'], jt.c['libraries_name']], where=bt['isbn']==jt.c['book_library_relations_book_isbn'], as_dataclass=False, limit=3)

[('C', 'Library1'), ('C', 'Library2'), ('C', 'Library3')]

Many-to-One Relationships

Now we create an author class and table to demonstrate a many-to-one relationship.

@doctable.schema(frozen=True, eq=True)
class Author:
    __slots__ = []
    #_id: int = doctable.IDCol()
    name: str = doctable.Col(primary_key=True, unique=True)
    age: int = doctable.Col()

class AuthorTable(doctable.DocTable):
    _tabname_ = 'authors'
    _schema_ = Author  
    _constraints_ = [doctable.Constraint('foreignkey', ('name',), ('books.author',))]

#book_table_auth = BookTable(target=f'{tmp.name}/16.db', new_db=True)
#author_table = AuthorTable(engine=book_table_auth.engine)
author_table = AuthorTable(engine=book_table.engine)

author_table.delete()
author_table.insert([
    Author(name='Devin Cornell', age=30),
    Author(name='Pierre Bourdieu', age=99),
    Author(name='Jean-Luc Picard', age=1000),
])
author_table.head()

	name	age
0	Devin Cornell	30
1	Pierre Bourdieu	99
2	Jean-Luc Picard	1000

book_table.head()

	_id	isbn	title	year	author	date_updated
0	2	E	E	2018	Jean-Luc Picard	2022-07-26 21:30:30.364812
1	6	C	C	2021	Jean-Luc Picard	2022-07-26 21:30:30.482867
2	7	A	A	2020	Pierre Bourdieu	2022-07-26 21:30:30.494686
3	8	B	B	2020	Jean-Luc Picard	2022-07-26 21:30:30.494692
4	9	D	D	2019	Devin Cornell	2022-07-26 21:30:30.494694

columns = [book_table['year'], author_table['age'], author_table['name']]
where = (book_table['author']==author_table['name']) & (book_table['author'] > 30)
book_table.select_df(columns, where=where)

	year	age	name
0	2018	1000	Jean-Luc Picard
1	2021	1000	Jean-Luc Picard
2	2020	99	Pierre Bourdieu
3	2020	1000	Jean-Luc Picard
4	2019	30	Devin Cornell