Creating parsetrees with spacy can be computationally expensive, so we often want to store the results in a database for later reuse. Because parsetrees may be large binary objects, we store them using the pickle file column type, with an additional serialization step.
# Standard-library and third-party imports.
import sys
from pathlib import Path

import numpy as np
import spacy

# Make the parent directory importable so the local doctable package is found.
sys.path.append('..')
import doctable

# Load the small English spacy model used for parsing below.
nlp = spacy.load('en_core_web_sm')

# Temp folder that is automatically cleaned up after python ends.
tmpfolder = doctable.TempFolder('tmp')
Create some test data and build a new ParseTreeDoc object for each text.
# A few example texts to parse.
texts = [
    'Help me Obi-Wan Kenobi. You’re my only hope. ',
    'I find your lack of faith disturbing. ',
    'Do, or do not. There is no try. ',
]

# Pipeline: run the spacy model, then convert each result to parsetrees.
parser = doctable.ParsePipeline([nlp, doctable.Comp('get_parsetrees')])
docs = parser.parsemany(texts)

# Each parsed doc contains one parsetree per sentence.
for parsed in docs:
    print(len(parsed))
2 1 2
Now we create a schema that includes the doc column with a ParseTreeFileCol default value. Note that using the type hint ParseTreeDoc together with a generic Col default value would also be sufficient.
import dataclasses

# Schema row for the database: the ParseTreeDoc type hint plus the
# ParseTreeFileCol default stores each doc as a pickle file in the given
# folder, keeping only a reference in the database itself (note the VARCHAR
# column type in the schema info below).
@doctable.schema(require_slots=False)
class DocRow:
    id: int = doctable.IDCol()  # auto-incrementing primary key
    doc: doctable.ParseTreeDoc = doctable.ParseTreeFileCol('tmp/parsetree_pickle_files')
    # could also use this:
    #doc: doctable.ParseTreeDoc = doctable.Col(type_args=dict(folder='tmp/parsetree_pickle_files'))

# Create a fresh database file from the schema and inspect its columns.
db = doctable.DocTable(target='tmp/test_ptrees.db', schema=DocRow, new_db=True)
db.schema_info()
[{'name': 'id', 'type': INTEGER(), 'nullable': False, 'default': None, 'autoincrement': 'auto', 'primary_key': 1}, {'name': 'doc', 'type': VARCHAR(), 'nullable': True, 'default': None, 'autoincrement': 'auto', 'primary_key': 0}]
# Insert each parsed doc as its own row; a single bulk insert of
# [{'doc': d} for d in docs] would work equally well.
for parsed in docs:
    db.insert({'doc': parsed})
db.head(3)
id | doc | |
---|---|---|
0 | 1 | [(Help, me, Obi, -, Wan, Kenobi, .), (You, ’re... |
1 | 2 | [(I, find, your, lack, of, faith, disturbing, .)] |
2 | 3 | [(Do, ,, or, do, not, .), (There, is, no, try,... |
# Read rows back as plain (id, doc) tuples and print each sentence's tokens.
for idx, doc in db.select(as_dataclass=False):
    print(f"doc id {idx}:")
    for snum, sent in enumerate(doc):
        print(f"\tsent {snum}: {[t.text for t in sent]}")
doc id 1: sent 0: ['Help', 'me', 'Obi', '-', 'Wan', 'Kenobi', '.'] sent 1: ['You', '’re', 'my', 'only', 'hope', '.'] doc id 2: sent 0: ['I', 'find', 'your', 'lack', 'of', 'faith', 'disturbing', '.'] doc id 3: sent 0: ['Do', ',', 'or', 'do', 'not', '.'] sent 1: ['There', 'is', 'no', 'try', '.']
See that the files exist; we can remove and clean them just as with any other file column type.
# List the pickle files backing the doc column.
for pickle_path in tmpfolder.path.rglob('*.pic'):
    print(str(pickle_path))
tmp/parsetree_pickle_files/957205257663_parsetreedoc.pic tmp/parsetree_pickle_files/331764742454_parsetreedoc.pic tmp/parsetree_pickle_files/348766251477_parsetreedoc.pic
# Delete one row; note below that its backing file is NOT removed yet.
db.delete(db['id']==1)
for leftover in tmpfolder.path.rglob('*.pic'):
    print(str(leftover))
db.head()
tmp/parsetree_pickle_files/957205257663_parsetreedoc.pic tmp/parsetree_pickle_files/331764742454_parsetreedoc.pic tmp/parsetree_pickle_files/348766251477_parsetreedoc.pic
id | doc | |
---|---|---|
0 | 2 | [(I, find, your, lack, of, faith, disturbing, .)] |
1 | 3 | [(Do, ,, or, do, not, .), (There, is, no, try,... |
# clean_col_files removes files that no longer have a referencing row.
db.clean_col_files('doc')
for remaining in tmpfolder.path.rglob('*.pic'):
    print(str(remaining))
tmp/parsetree_pickle_files/957205257663_parsetreedoc.pic tmp/parsetree_pickle_files/348766251477_parsetreedoc.pic