Datasets and Relations#

View On GitHub Open In Colab

In this short tutorial, we setup a local database, insert a demo dataset from qrlew-dataset and show how to create Relations from SQL queries and derived Datasets (set of Relations).

Setup a local database#

%%capture
# Load the database
# Inspired by https://colab.research.google.com/github/tensorflow/io/blob/master/docs/tutorials/postgresql.ipynb#scrollTo=YUj0878jPyz7
!sudo apt-get -y -qq update
!sudo apt-get -y -qq install postgresql-14 graphviz
# Start postgresql server
!sudo sed -i "s/port = 5432/port = 5433/g" /etc/postgresql/14/main/postgresql.conf
!sudo service postgresql start
# Set password
!sudo -u postgres psql -U postgres -c "ALTER USER postgres PASSWORD 'pyqrlew-db'"
!pip install -U pyqrlew matplotlib graphviz

Insert data#

import pyqrlew as qrl
from pyqrlew.io import PostgreSQL

# Setup a default database
DB = PostgreSQL()
# Insert a demo dataset
DB.load_extract()
dataset = qrl.Dataset.from_database(name='extract', engine=DB.engine(), schema_name='extract')
from graphviz import Source
from IPython.display import display

for path, relation in dataset.relations():
    print(path)
    print(relation.to_query())
    display(Source(relation.dot()))
['extract', 'extract', 'beacon']
SELECT * FROM extract.beacon
../_images/862d44531291ad693d51b678032d2f33c27c2c70e92a0708505033477b15f6fc.svg
['extract', 'extract', 'census']
SELECT * FROM extract.census
../_images/09878664e391d8a0890ebd974b49d097de7cc6c5069776287716089331cbcb9a.svg

Build Relations and Datasets#

derived_relation = dataset.relation('SELECT age, count(age) FROM extract.census GROUP BY age')
Source(derived_relation.dot())
../_images/3a464305fa27b320c6a45da87ee489ece1bd560209ce523acc3a987a1bcedf1e.svg
queries = [
    (("schema_name", "sch", "tab1"), 'SELECT age, count(age) FROM extract.census GROUP BY age LIMIT 10'),
    (("schema_name", "sch", "sum_age"), 'SELECT SUM(age) FROM extract.census'),
    (("schema_name", "new_sch", "bacon"), 'SELECT * FROM extract.beacon')
]
derived_dataset = dataset.from_queries(queries)
print(derived_dataset)
for path, relation in derived_dataset.relations():
    print(path)
    print(relation.to_query())
    display(Source(relation.dot()))
Dataset: {}
Schema: {"name": "schema_name", "type": {"name": "Struct", "struct": {"fields": [{"name": "sarus_data", "type": {"name": "Union", "union": {"fields": [{"name": "new_sch", "type": {"name": "Union", "union": {"fields": [{"name": "bacon", "type": {"name": "Struct", "struct": {"fields": [{"name": "検知日時", "type": {"name": "Datetime", "datetime": {"format": "%Y-%m-%d %H:%M:%S.%9f", "min": "0001-01-01 00:00:00.000000000", "max": "9999-12-31 00:00:00.000000000"}}}, {"name": "UserId", "type": {"name": "Text", "text": {}}}, {"name": "所属部署", "type": {"name": "Text", "text": {}}}, {"name": "フロア名", "type": {"name": "Text", "text": {}}}, {"name": "Beacon名", "type": {"name": "Text", "text": {}}}, {"name": "RSSI", "type": {"name": "Integer", "integer": {"min": "-9223372036854775808", "max": "9223372036854775807"}}}, {"name": "マップのX座標", "type": {"name": "Integer", "integer": {"min": "-9223372036854775808", "max": "9223372036854775807"}}}, {"name": "マップのY座標", "type": {"name": "Integer", "integer": {"min": "-9223372036854775808", "max": "9223372036854775807"}}}]}}}]}}}, {"name": "sch", "type": {"name": "Union", "union": {"fields": [{"name": "tab1", "type": {"name": "Struct", "struct": {"fields": [{"name": "age", "type": {"name": "Integer", "integer": {"min": "-9223372036854775808", "max": "9223372036854775807"}, "properties": {"_CONSTRAINT_": "_UNIQUE_"}}}, {"name": "field_fxi8", "type": {"name": "Integer", "integer": {"max": "199"}}}]}}}, {"name": "sum_age", "type": {"name": "Struct", "struct": {"fields": [{"name": "field_0_go", "type": {"name": "Integer", "integer": {"min": "-9223372036854775808", "max": "9223372036854775807"}}}]}}}]}}}]}}}, {"name": "sarus_protected_entity", "type": {"name": "Optional", "optional": {"type": {"name": "Id", "id": {}}}}}, {"name": "sarus_is_public", "type": {"name": "Boolean", "boolean": {}}}, {"name": "sarus_weights", "type": {"name": "Float", "float": {"max": 1.7976931348623157e308}}}]}}}
Size: {"statistics": {"name": "Union", "union": {"fields": [{"name": "sch", "statistics": {"name": "Union", "union": {"fields": [{"name": "sum_age", "statistics": {"struct": {"size": "199"}}}, {"name": "tab1", "statistics": {"struct": {"size": "10"}}}]}}}, {"name": "new_sch", "statistics": {"name": "Union", "union": {"fields": [{"name": "bacon", "statistics": {"struct": {"size": "100"}}}]}}}]}}}
['schema_name', 'new_sch', 'bacon']
SELECT * FROM new_sch.bacon
../_images/03176e651ee31a63fbc68db6daba38354ad526dd9ba2a4bda0bfbf2e3280a576.svg
['schema_name', 'sch', 'sum_age']
SELECT * FROM sch.sum_age
../_images/d885f3b1de35f7f3c7585d772e5e3c9a9fd23a8eac396d457bddae9440c6dd1c.svg
['schema_name', 'sch', 'tab1']
SELECT * FROM sch.tab1
../_images/59a7c20ee261a27271ac575bf1f93f0a12690b9211c79fe0120273ce85bd4edb.svg
derived_dataset.schema
'{"name": "schema_name", "type": {"name": "Struct", "struct": {"fields": [{"name": "sarus_data", "type": {"name": "Union", "union": {"fields": [{"name": "new_sch", "type": {"name": "Union", "union": {"fields": [{"name": "bacon", "type": {"name": "Struct", "struct": {"fields": [{"name": "検知日時", "type": {"name": "Datetime", "datetime": {"format": "%Y-%m-%d %H:%M:%S.%9f", "min": "0001-01-01 00:00:00.000000000", "max": "9999-12-31 00:00:00.000000000"}}}, {"name": "UserId", "type": {"name": "Text", "text": {}}}, {"name": "所属部署", "type": {"name": "Text", "text": {}}}, {"name": "フロア名", "type": {"name": "Text", "text": {}}}, {"name": "Beacon名", "type": {"name": "Text", "text": {}}}, {"name": "RSSI", "type": {"name": "Integer", "integer": {"min": "-9223372036854775808", "max": "9223372036854775807"}}}, {"name": "マップのX座標", "type": {"name": "Integer", "integer": {"min": "-9223372036854775808", "max": "9223372036854775807"}}}, {"name": "マップのY座標", "type": {"name": "Integer", "integer": {"min": "-9223372036854775808", "max": "9223372036854775807"}}}]}}}]}}}, {"name": "sch", "type": {"name": "Union", "union": {"fields": [{"name": "tab1", "type": {"name": "Struct", "struct": {"fields": [{"name": "age", "type": {"name": "Integer", "integer": {"min": "-9223372036854775808", "max": "9223372036854775807"}, "properties": {"_CONSTRAINT_": "_UNIQUE_"}}}, {"name": "field_fxi8", "type": {"name": "Integer", "integer": {"max": "199"}}}]}}}, {"name": "sum_age", "type": {"name": "Struct", "struct": {"fields": [{"name": "field_0_go", "type": {"name": "Integer", "integer": {"min": "-9223372036854775808", "max": "9223372036854775807"}}}]}}}]}}}]}}}, {"name": "sarus_protected_entity", "type": {"name": "Optional", "optional": {"type": {"name": "Id", "id": {}}}}}, {"name": "sarus_is_public", "type": {"name": "Boolean", "boolean": {}}}, {"name": "sarus_weights", "type": {"name": "Float", "float": {"max": 1.7976931348623157e308}}}]}}}'
derived_dataset.size
'{"statistics": {"name": "Union", "union": {"fields": [{"name": "sch", "statistics": {"name": "Union", "union": {"fields": [{"name": "sum_age", "statistics": {"struct": {"size": "199"}}}, {"name": "tab1", "statistics": {"struct": {"size": "10"}}}]}}}, {"name": "new_sch", "statistics": {"name": "Union", "union": {"fields": [{"name": "bacon", "statistics": {"struct": {"size": "100"}}}]}}}]}}}'