Here I show how to select data from a DocTable. We cover object-oriented conditional selects emulating the WHERE
SQL clause, as well as some reduce functions.
import random
import pandas as pd
import numpy as np
import sys
sys.path.append('..')
import doctable
import dataclasses
@doctable.schema
class Record:
__slots__ = []
id: int = doctable.IDCol()
name: str = doctable.Col(nullable=False)
age: int = None
is_old: bool = None
table = doctable.DocTable(target=':memory:', schema=Record, verbose=True)
print(table)
<DocTable (4 cols)::sqlite:///:memory::_documents_>
N = 10
for i in range(N):
age = random.random() # number in [0,1]
is_old = age > 0.5
table.insert({'name':'user_'+str(i), 'age':age, 'is_old':is_old}, verbose=False)
print(table)
<DocTable (4 cols)::sqlite:///:memory::_documents_>
These functions all return lists of ResultProxy objects. As such, they can be accessed using numerical indices or keyword indices. For instance, if one select output row is row=(1, 'user_0')
(after selecting "id" and "user"), it can be accessed such that row[0]==row['id']
and row[1]==row['user']
.
# the limit argument means the result will only return some rows.
# I'll use it for convenience in these examples.
# this selects all rows
table.select(limit=2)
DocTable: SELECT _documents_.id, _documents_.name, _documents_.age, _documents_.is_old FROM _documents_ LIMIT ? OFFSET ?
[Record(id=1, name='user_0', age=0.04379398878146823, is_old=False), Record(id=2, name='user_1', age=0.32083938549074387, is_old=False)]
table.select(['id','name'], limit=1)
DocTable: SELECT _documents_.id, _documents_.name FROM _documents_ LIMIT ? OFFSET ?
[Record(id=1, name='user_0', age=None, is_old=None)]
# can also select by accessing the column object (db['id']) itself
# this will be useful later with more complex queries
table.select([table['id'],table['name']], limit=1)
DocTable: SELECT _documents_.id, _documents_.name FROM _documents_ LIMIT ? OFFSET ?
[Record(id=1, name='user_0', age=None, is_old=None)]
table.select_first()
DocTable: SELECT _documents_.id, _documents_.name, _documents_.age, _documents_.is_old FROM _documents_ LIMIT ? OFFSET ?
Record(id=1, name='user_0', age=0.04379398878146823, is_old=False)
table.select('name',limit=5)
DocTable: SELECT _documents_.name FROM _documents_ LIMIT ? OFFSET ?
['user_0', 'user_1', 'user_2', 'user_3', 'user_4']
table.select_first('age')
DocTable: SELECT _documents_.age FROM _documents_ LIMIT ? OFFSET ?
0.04379398878146823
table.select(where=table['id']==2)
DocTable: SELECT _documents_.id, _documents_.name, _documents_.age, _documents_.is_old FROM _documents_ WHERE _documents_.id = ?
[Record(id=2, name='user_1', age=0.32083938549074387, is_old=False)]
table.select(where=table['id']<3)
DocTable: SELECT _documents_.id, _documents_.name, _documents_.age, _documents_.is_old FROM _documents_ WHERE _documents_.id < ?
[Record(id=1, name='user_0', age=0.04379398878146823, is_old=False), Record(id=2, name='user_1', age=0.32083938549074387, is_old=False)]
# mod operator works too
table.select(where=(table['id']%2)==0, limit=2)
DocTable: SELECT _documents_.id, _documents_.name, _documents_.age, _documents_.is_old FROM _documents_ WHERE _documents_.id % ? = ? LIMIT ? OFFSET ?
[Record(id=2, name='user_1', age=0.32083938549074387, is_old=False), Record(id=4, name='user_3', age=0.5649107128684847, is_old=True)]
# note parantheses to handle order of ops with overloaded bitwise ops
table.select(where= (table['id']>=2) & (table['id']<=4) & (table['name']!='user_2'))
DocTable: SELECT _documents_.id, _documents_.name, _documents_.age, _documents_.is_old FROM _documents_ WHERE _documents_.id >= ? AND _documents_.id <= ? AND _documents_.name != ?
[Record(id=2, name='user_1', age=0.32083938549074387, is_old=False), Record(id=4, name='user_3', age=0.5649107128684847, is_old=True)]
table.select(where=table['name'].in_(('user_2','user_3')))
DocTable: SELECT _documents_.id, _documents_.name, _documents_.age, _documents_.is_old FROM _documents_ WHERE _documents_.name IN (?, ?)
[Record(id=3, name='user_2', age=0.10213129577978541, is_old=False), Record(id=4, name='user_3', age=0.5649107128684847, is_old=True)]
table.select(where=table['id'].between(2,4))
DocTable: SELECT _documents_.id, _documents_.name, _documents_.age, _documents_.is_old FROM _documents_ WHERE _documents_.id BETWEEN ? AND ?
[Record(id=2, name='user_1', age=0.32083938549074387, is_old=False), Record(id=3, name='user_2', age=0.10213129577978541, is_old=False), Record(id=4, name='user_3', age=0.5649107128684847, is_old=True)]
# use of logical not operator "~"
table.select(where= ~(table['name'].in_(('user_2','user_3'))) & (table['id'] < 4))
DocTable: SELECT _documents_.id, _documents_.name, _documents_.age, _documents_.is_old FROM _documents_ WHERE _documents_.name NOT IN (?, ?) AND _documents_.id < ?
[Record(id=1, name='user_0', age=0.04379398878146823, is_old=False), Record(id=2, name='user_1', age=0.32083938549074387, is_old=False)]
# more verbose operators .and_, .or_, and .not_ are bound to the doctable package
table.select(where= doctable.or_(doctable.not_(table['id']==4)) & (table['id'] <= 2))
DocTable: SELECT _documents_.id, _documents_.name, _documents_.age, _documents_.is_old FROM _documents_ WHERE _documents_.id != ? AND _documents_.id <= ?
[Record(id=1, name='user_0', age=0.04379398878146823, is_old=False), Record(id=2, name='user_1', age=0.32083938549074387, is_old=False)]
# now with simple computation
ages = table.select(table['age'])
mean_age = sum(ages)/len(ages)
table.select(table['name'], where=table['age']>mean_age, limit=2)
DocTable: SELECT _documents_.age FROM _documents_ DocTable: SELECT _documents_.name FROM _documents_ WHERE _documents_.age > ? LIMIT ? OFFSET ?
['user_3', 'user_5']
# apply .label() method to columns
dict(table.select_first([table['age'].label('myage'), table['name'].label('myname')], as_dataclass=False))
DocTable: SELECT _documents_.age AS myage, _documents_.name AS myname FROM _documents_ LIMIT ? OFFSET ?
{'myage': 0.04379398878146823, 'myname': 'user_0'}
I bind the .min, .max, .count, .sum, and .mode methods to the column objects. Additionally, I move the .count method to a separate DocTable2 method.
table.select_first([table['age'].sum, table['age'].count, table['age']], as_dataclass=False)
DocTable: SELECT sum(_documents_.age) AS sum_1, count(_documents_.age) AS count_1, _documents_.age FROM _documents_ LIMIT ? OFFSET ?
(3.748372290260373, 10, 0.04379398878146823)
# with labels now
dict(table.select_first([table['age'].sum.label('sum'), table['age'].count.label('ct')], as_dataclass=False))
DocTable: SELECT sum(_documents_.age) AS sum, count(_documents_.age) AS ct FROM _documents_ LIMIT ? OFFSET ?
{'sum': 3.748372290260373, 'ct': 10}
These additional arguments have also been provided.
# the limit is obvious - it has been used throughout these examples
table.select(limit=2)
DocTable: SELECT _documents_.id, _documents_.name, _documents_.age, _documents_.is_old FROM _documents_ LIMIT ? OFFSET ?
[Record(id=1, name='user_0', age=0.04379398878146823, is_old=False), Record(id=2, name='user_1', age=0.32083938549074387, is_old=False)]
# orderby clause
table.select(orderby=table['age'].desc(), limit=2)
DocTable: SELECT _documents_.id, _documents_.name, _documents_.age, _documents_.is_old FROM _documents_ ORDER BY _documents_.age DESC LIMIT ? OFFSET ?
[Record(id=8, name='user_7', age=0.7809152967349511, is_old=True), Record(id=6, name='user_5', age=0.6081281581554809, is_old=True)]
# compound orderby
table.select(orderby=(table['age'].desc(),table['is_old'].asc()), limit=2)
DocTable: SELECT _documents_.id, _documents_.name, _documents_.age, _documents_.is_old FROM _documents_ ORDER BY _documents_.age DESC, _documents_.is_old ASC LIMIT ? OFFSET ?
[Record(id=8, name='user_7', age=0.7809152967349511, is_old=True), Record(id=6, name='user_5', age=0.6081281581554809, is_old=True)]
# can also use column name directly
# can only use ascending and can use only one col
table.select(orderby='age', limit=2)
DocTable: SELECT _documents_.id, _documents_.name, _documents_.age, _documents_.is_old FROM _documents_ ORDER BY _documents_.age LIMIT ? OFFSET ?
[Record(id=1, name='user_0', age=0.04379398878146823, is_old=False), Record(id=3, name='user_2', age=0.10213129577978541, is_old=False)]
# groupby clause
# returns first row of each group without any aggregation functions
table.select(groupby=table['is_old'])
DocTable: SELECT _documents_.id, _documents_.name, _documents_.age, _documents_.is_old FROM _documents_ GROUP BY _documents_.is_old
[Record(id=1, name='user_0', age=0.04379398878146823, is_old=False), Record(id=4, name='user_3', age=0.5649107128684847, is_old=True)]
# compound groupby (weird example bc name is unique - have only one cat var in this demo)
table.select(groupby=(table['is_old'],table['name']), limit=3)
DocTable: SELECT _documents_.id, _documents_.name, _documents_.age, _documents_.is_old FROM _documents_ GROUP BY _documents_.is_old, _documents_.name LIMIT ? OFFSET ?
[Record(id=1, name='user_0', age=0.04379398878146823, is_old=False), Record(id=2, name='user_1', age=0.32083938549074387, is_old=False), Record(id=3, name='user_2', age=0.10213129577978541, is_old=False)]
# groupby clause using max aggregation function
# gets match age for both old and young groups
table.select(table['age'].max, groupby=table['is_old'])
DocTable: SELECT max(_documents_.age) AS max_1 FROM _documents_ GROUP BY _documents_.is_old
[0.44776169274293975, 0.7809152967349511]
For cases where DocTable2 does not provide a convenient interface, you may submit raw SQL commands. These may be a bit more unwieldly, but they offer maximum flexibility. They may be used either as simply an addition to the WHERE or arbitrary end clauses, or accessed in totality.
qstr = 'SELECT age,name FROM {} WHERE id=="{}"'.format(table.tabname, 1)
results = table.execute(qstr)
dict(list(results)[0])
DocTable: SELECT age,name FROM _documents_ WHERE id=="1"
{'age': 0.04379398878146823, 'name': 'user_0'}
wherestr = 'is_old=="{}"'.format('1')
table.select(wherestr=wherestr, limit=2)
DocTable: SELECT _documents_.id, _documents_.name, _documents_.age, _documents_.is_old FROM _documents_ WHERE (is_old=="1") LIMIT ? OFFSET ?
[Record(id=4, name='user_3', age=0.5649107128684847, is_old=True), Record(id=6, name='user_5', age=0.6081281581554809, is_old=True)]
# combine whrstr with structured query where clause
wherestr = 'is_old=="{}"'.format('1')
table.select(where=table['id']<=5, wherestr=wherestr)
DocTable: SELECT _documents_.id, _documents_.name, _documents_.age, _documents_.is_old FROM _documents_ WHERE _documents_.id <= ? AND (is_old=="1")
[Record(id=4, name='user_3', age=0.5649107128684847, is_old=True)]
# combine whrstr with structured query where clause
wherestr = 'is_old=="{}"'.format('1')
table.select(where=table['id']<=5, wherestr=wherestr)
DocTable: SELECT _documents_.id, _documents_.name, _documents_.age, _documents_.is_old FROM _documents_ WHERE _documents_.id <= ? AND (is_old=="1")
[Record(id=4, name='user_3', age=0.5649107128684847, is_old=True)]
.count()
is a convenience method. Mostly the same could be accomplished by db.select_first(db['id'].count)
, but this requires no reference to a specific column.
.next_id()
is especially useful if one hopes to enter the id (or any primary key column) into new rows manually. Especially useful because SQL engines don't provide new ids except when a single insert is performed.
table.count()
DocTable: SELECT count() AS count_1 FROM _documents_ LIMIT ? OFFSET ?
10
table.count(table['age'] < 0.5)
DocTable: SELECT count() AS count_1 FROM _documents_ WHERE _documents_.age < ? LIMIT ? OFFSET ?
7
These are especially useful when working with metadata because Pandas provides robust descriptive and plotting features than SQL alone. Good for generating sample information.
# must provide only a single column
table.select_series(table['age']).head(2)
DocTable: SELECT _documents_.age FROM _documents_
0 0.043794 1 0.320839 dtype: float64
table.select_series(table['age']).quantile([0.025, 0.985])
DocTable: SELECT _documents_.age FROM _documents_
0.025 0.056920 0.985 0.757589 dtype: float64
table.select_df(['id','age']).head(2)
DocTable: SELECT _documents_.id, _documents_.age FROM _documents_
id | age | |
---|---|---|
0 | 1 | 0.043794 |
1 | 2 | 0.320839 |
table.select_df('age').head(2)
DocTable: SELECT _documents_.age FROM _documents_
age | |
---|---|
0 | 0.043794 |
1 | 0.320839 |
# must provide list of cols (even for one col)
table.select_df([table['id'],table['age']]).corr()
DocTable: SELECT _documents_.id, _documents_.age FROM _documents_
id | age | |
---|---|---|
id | 1.00000 | 0.43105 |
age | 0.43105 | 1.00000 |
table.select_df([table['id'],table['age']]).describe().T
DocTable: SELECT _documents_.id, _documents_.age FROM _documents_
count | mean | std | min | 25% | 50% | 75% | max | |
---|---|---|---|---|---|---|---|---|
id | 10.0 | 5.500000 | 3.027650 | 1.000000 | 3.250000 | 5.500000 | 7.750000 | 10.000000 |
age | 10.0 | 0.374837 | 0.237944 | 0.043794 | 0.183425 | 0.376265 | 0.535623 | 0.780915 |
mean_age = table.select_series(table['age']).mean()
df = table.select_df([table['id'],table['age']])
df['old_grp'] = df['age'] > mean_age
df.groupby('old_grp').describe()
DocTable: SELECT _documents_.age FROM _documents_ DocTable: SELECT _documents_.id, _documents_.age FROM _documents_
id | age | |||||||||||||||
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
count | mean | std | min | 25% | 50% | 75% | max | count | mean | std | min | 25% | 50% | 75% | max | |
old_grp | ||||||||||||||||
False | 5.0 | 4.0 | 3.162278 | 1.0 | 2.0 | 3.0 | 5.0 | 9.0 | 5.0 | 0.182993 | 0.124028 | 0.043794 | 0.102131 | 0.142750 | 0.305451 | 0.320839 |
True | 5.0 | 7.0 | 2.236068 | 4.0 | 6.0 | 7.0 | 8.0 | 10.0 | 5.0 | 0.566681 | 0.141405 | 0.431691 | 0.447762 | 0.564911 | 0.608128 | 0.780915 |
# more complicated groupby aggregation.
# calculates the variance both for entries above and below average age
mean_age = table.select_series(table['age']).mean()
df = table.select_df([table['name'],table['age']])
df['old_grp'] = df['age']>mean_age
df.groupby('old_grp').agg(**{
'first_name':pd.NamedAgg(column='name', aggfunc='first'),
'var_age':pd.NamedAgg(column='age', aggfunc=np.var),
})
DocTable: SELECT _documents_.age FROM _documents_ DocTable: SELECT _documents_.name, _documents_.age FROM _documents_
first_name | var_age | |
---|---|---|
old_grp | ||
False | user_0 | 0.015383 |
True | user_3 | 0.019995 |
In cases where you have many rows or each row contains a lot of data, you may want to perform a select query which makes requests in chunks. This is performed using the SQL OFFSET command, and querying up to buffsize while yielding each returned row. This system is designed this way because the underlying sql engine buffers all rows retreived from a query, and thus there is no way to stream data into memory without this system.
NOTE: The limit keyword is incompatible with this method - it will return all results. A workaround is to use the approx_max_rows param, which will return at minimum this number of rows, at max the specified number of rows plus buffsize.
for row_chunk in table.select_chunks(chunksize=2, where=(table['id']%2)==0, verbose=False):
print(row_chunk)
[Record(id=2, name='user_1', age=0.32083938549074387, is_old=False), Record(id=4, name='user_3', age=0.5649107128684847, is_old=True)] [Record(id=6, name='user_5', age=0.6081281581554809, is_old=True), Record(id=8, name='user_7', age=0.7809152967349511, is_old=True)] [Record(id=10, name='user_9', age=0.431690999080828, is_old=False)] []