Qrlew as a query Translator#

View On GitHub Open In Colab

In this notebook, we apply qrlew to translate SQL queries from one dialect e.g. PostgresSql to another such as MsSQL.

Install and run the Microsoft SQL Server#

%%capture
# Install the mssql-server 2017 (Not working on colab)
!curl https://packages.microsoft.com/keys/microsoft.asc | sudo tee /etc/apt/trusted.gpg.d/microsoft.asc
!curl https://packages.microsoft.com/config/ubuntu/$(lsb_release -rs)/prod.list | sudo tee /etc/apt/sources.list.d/mssql-release.list
!sudo apt-get update
!sudo ACCEPT_EULA=Y apt-get install -y msodbcsql18
!sudo ACCEPT_EULA=Y apt-get install -y mssql-tools18
!sudo apt-get install -y unixodbc-dev

# installing mssql-server
!curl -fsSL https://packages.microsoft.com/keys/microsoft.asc | sudo gpg --dearmor -o /usr/share/keyrings/microsoft-prod.gpg
!curl https://packages.microsoft.com/keys/microsoft.asc | sudo tee /etc/apt/trusted.gpg.d/microsoft.asc
!curl -fsSL https://packages.microsoft.com/config/ubuntu/22.04/mssql-server-2022.list | sudo tee /etc/apt/sources.list.d/mssql-server-2022.list
!sudo apt-get update
!sudo apt-get install -y mssql-server

# Configure and run the server manually. This is a workaround to install the MSSQL server in a colab
%env MSSQL_SA_PASSWORD="MyStrongPass$"
!sudo /opt/mssql/bin/mssql-conf -n setup accept-eula
!nohup /opt/mssql/bin/sqlservr &
# give time to the server to start and check that it started correctly
import time
time.sleep(10)
!/opt/mssql-tools18/bin/sqlcmd -S localhost -U SA -P $MSSQL_SA_PASSWORD -C -Q "SELECT 1"
           
-----------
          1

(1 rows affected)
%%capture
!pip install -U pyqrlew matplotlib graphviz sqlalchemy pyodbc

For the purpose of this demonstration, we focus on a single table. We create the census_schema and census_table and we fill it with values.

sql_script = """
CREATE SCHEMA census_schema;
GO
CREATE TABLE census_schema.census_table (
    id INT IDENTITY(1,1)  PRIMARY KEY,
    first_name VARCHAR(50),
    last_name VARCHAR(50),
    age INT,
    gender VARCHAR(10),
    occupation VARCHAR(50),
    city VARCHAR(50),
    state VARCHAR(50)
);
GO
-- Inserting sample data
INSERT INTO census_schema.census_table (first_name, last_name, age, gender, occupation, city, state)
VALUES
    ('John', 'Doe', 30, 'Male', 'Engineer', 'New York', 'NY'),
    ('Jane', 'Doe', 28, 'Female', 'Engineer', 'Los Angeles', 'CA'),
    ('Bob', 'Smith', 45, 'Male', 'Analyst', 'Chicago', 'IL'),
    ('Alice', 'Johnson', 35, 'Female', 'Data Scientist', 'San Francisco', 'CA'),
    ('Tom', 'Brown', 50, 'Male', 'Lawyer', 'Miami', 'FL'),
    ('Emily', 'Davis', 32, 'Female', 'Software Developer', 'Seattle', 'WA'),
    ('Charlie', 'Miller', 40, 'Male', 'Accountant', 'Boston', 'MA'),
    ('Eva', 'Williams', 29, 'Female', 'Marketing Manager', 'Austin', 'TX'),
    ('Mike', 'Jones', 38, 'Male', 'Analyst', 'Denver', 'CO'),
    ('Sophia', 'Taylor', 42, 'Female', 'Analyst', 'Atlanta', 'GA'),
    ('John', 'Doe', 30, 'Male', 'Engineer', 'New York', 'NY'),
    ('Jane', 'Doe', 28, 'Female', 'Engineer', 'Los Angeles', 'CA'),
    ('Bob', 'Smith', 45, 'Male', 'Analyst', 'Chicago', 'IL'),
    ('Alice', 'Johnson', 35, 'Female', 'Data Scientist', 'San Francisco', 'CA'),
    ('Tom', 'Brown', 50, 'Male', 'Lawyer', 'Miami', 'FL'),
    ('Emily', 'Davis', 32, 'Female', 'Software Developer', 'Seattle', 'WA'),
    ('Charlie', 'Miller', 40, 'Male', 'Accountant', 'Boston', 'MA'),
    ('Eva', 'Williams', 29, 'Female', 'Marketing Manager', 'Austin', 'TX'),
    ('Mike', 'Jones', 38, 'Male', 'Analyst', 'Denver', 'CO'),
    ('Sophia', 'Taylor', 42, 'Female', 'Analyst', 'Atlanta', 'GA'),
    ('Alex', 'Johnson', 33, 'Male', 'Analyst', 'Portland', 'OR'),
    ('Megan', 'White', 27, 'Female', 'Data Scientist', 'San Diego', 'CA'),
    ('Daniel', 'Clark', 36, 'Male', 'Analyst', 'Houston', 'TX'),
    ('Olivia', 'Martinez', 31, 'Female', 'Data Scientist', 'Phoenix', 'AZ'),
    ('William', 'Lee', 48, 'Male', 'Professor', 'Philadelphia', 'PA'),
    ('Emma', 'Garcia', 29, 'Female', 'Data Scientist', 'Minneapolis', 'MN'),
    ('Ryan', 'Harris', 41, 'Male', 'Data Scientist', 'Dallas', 'TX'),
    ('Ava', 'Taylor', 26, 'Female', 'Product Manager', 'San Jose', 'CA'),
    ('Christopher', 'Anderson', 37, 'Male', 'Engineer', 'Detroit', 'MI'),
    ('Sophie', 'Wilson', 34, 'Female', 'HR Specialist', 'Raleigh', 'NC'),
    ('David', 'Miller', 29, 'Male', 'Engineer', 'New York', 'NY'),
    ('Sophia', 'Taylor', 34, 'Female', 'Engineer', 'Los Angeles', 'CA'),
    ('Bob', 'Smith', 42, 'Male', 'Analyst', 'Chicago', 'IL'),
    ('Alice', 'Johnson', 39, 'Female', 'Data Scientist', 'San Francisco', 'CA'),
    ('Tom', 'Brown', 37, 'Male', 'Lawyer', 'Miami', 'FL'),
    ('Emily', 'Davis', 28, 'Female', 'Software Developer', 'Seattle', 'WA'),
    ('Charlie', 'Miller', 45, 'Male', 'Accountant', 'Boston', 'MA'),
    ('Eva', 'Williams', 31, 'Female', 'Marketing Manager', 'Austin', 'TX'),
    ('Mike', 'Jones', 36, 'Male', 'Analyst', 'Denver', 'CO'),
    ('Sophia', 'Taylor', 49, 'Female', 'Analyst', 'Atlanta', 'GA'),
    ('Alex', 'Johnson', 37, 'Male', 'Analyst', 'Portland', 'OR'),
    ('Megan', 'White', 27, 'Female', 'Data Scientist', 'San Diego', 'CA'),
    ('Daniel', 'Clark', 41, 'Male', 'Analyst', 'Houston', 'TX'),
    ('Olivia', 'Martinez', 32, 'Female', 'Data Scientist', 'Phoenix', 'AZ'),
    ('William', 'Lee', 47, 'Male', 'Professor', 'Philadelphia', 'PA'),
    ('Emma', 'Garcia', 30, 'Female', 'Data Scientist', 'Minneapolis', 'MN'),
    ('Ryan', 'Harris', 40, 'Male', 'Entrepreneur', 'Dallas', 'TX'),
    ('Ava', 'Taylor', 26, 'Female', 'Product Manager', 'San Jose', 'CA'),
    ('Christopher', 'Anderson', 35, 'Male', 'Engineer', 'Detroit', 'MI'),
    ('Sophie', 'Wilson', 33, 'Female', 'HR Specialist', 'Raleigh', 'NC'),
    ('Grace', 'Brown', 55, 'Female', 'Analyst', 'New York', 'NY'),
    ('Brian', 'Taylor', 50, 'Male', 'Professor', 'Los Angeles', 'CA'),
    ('Lily', 'Harrison', 42, 'Female', 'Engineer', 'Chicago', 'IL'),
    ('Kevin', 'Garcia', 55, 'Male', 'Analyst', 'San Francisco', 'CA'),
    ('Sophie', 'Smithson', 48, 'Female', 'Lawyer', 'Miami', 'FL'),
    ('John', 'Clarkston', 38, 'Male', 'Software Developer', 'Seattle', 'WA'),
    ('Emma', 'Millerson', 40, 'Female', 'Accountant', 'Boston', 'MA'),
    ('David', 'Williamson', 45, 'Male', 'Marketing Manager', 'Austin', 'TX'),
    ('Ava', 'Johnsonson', 38, 'Female', 'Analyst', 'Denver', 'CO'),
    ('Daniel', 'Anderson', 42, 'Male', 'Data Scientist', 'Atlanta', 'GA'),
    ('Michael', 'Martinson', 33, 'Male', 'Engineer', 'Portland', 'OR'),
    ('Sophia', 'Whiterson', 27, 'Female', 'Data Scientist', 'San Diego', 'CA'),
    ('Robert', 'Davidson', 41, 'Male', 'Analyst', 'Houston', 'TX'),
    ('Grace', 'Wilson', 45, 'Female', 'Data Scientist', 'Phoenix', 'AZ'),
    ('Andrew', 'Leeson', 48, 'Male', 'Professor', 'Philadelphia', 'PA'),
    ('Emily', 'Garcia', 39, 'Female', 'Data Scientist', 'Minneapolis', 'MN'),
    ('Christopher', 'Sanders', 41, 'Male', 'Entrepreneur', 'Dallas', 'TX'),
    ('Olivia', 'Martinez', 36, 'Female', 'Product Manager', 'San Jose', 'CA'),
    ('Sophie', 'Brownson', 43, 'Female', 'Engineer', 'Detroit', 'MI'),
    ('Matthew', 'Taylors', 46, 'Male', 'HR Specialist', 'Raleigh', 'NC');
GO

"""
with open('sql_script.sql', 'w') as f:
  f.write(sql_script)

!/opt/mssql-tools18/bin/sqlcmd -S localhost -U SA -P $MSSQL_SA_PASSWORD -C -i sql_script.sql
(70 rows affected)

Use qrlew on the database#

At this stage, we have a mssql dataset that can be connected with qrlew.

from urllib.parse import quote_plus
from sqlalchemy import create_engine
import os
# creating the sqlalchemy engine
MSSQL_DRIVER = "{ODBC Driver 18 for SQL Server}"
MSSQL_SERVER = "localhost"
MSSQL_DATABASE = "master"
MSSQL_USERNAME = "sa"
MSSQL_PASSWORD = os.environ.get('MSSQL_SA_PASSWORD')
ADVANCED_PARAMETERS = "Encrypt=yes;TrustServerCertificate=yes;Connection Timeout=30;"
MSSQL_URI = "mssql+pyodbc:///?odbc_connect={}".format(
    quote_plus(
        (
            f"Driver={MSSQL_DRIVER};Server=tcp:{MSSQL_SERVER};"
            f"Database={MSSQL_DATABASE};Uid={MSSQL_USERNAME};"
            f"Pwd={MSSQL_PASSWORD};{ADVANCED_PARAMETERS}"
        )
    )
)
print(MSSQL_URI)
engine = create_engine(MSSQL_URI)
mssql+pyodbc:///?odbc_connect=Driver%3D%7BODBC+Driver+18+for+SQL+Server%7D%3BServer%3Dtcp%3Alocalhost%3BDatabase%3Dmaster%3BUid%3Dsa%3BPwd%3D%22MyStrongPass%24%22%3BEncrypt%3Dyes%3BTrustServerCertificate%3Dyes%3BConnection+Timeout%3D30%3B
# another sanity check with sqlalchemy
with engine.connect() as conn:
  res = conn.execute("SELECT 1")
  print(res.fetchall())
[(1,)]
<ipython-input-7-0ac7eb6b049f>:3: RemovedIn20Warning: Deprecated API features detected! These feature(s) are not compatible with SQLAlchemy 2.0. To prevent incompatible upgrades prior to updating applications, ensure requirements files are pinned to "sqlalchemy<2.0". Set environment variable SQLALCHEMY_WARN_20=1 to show all deprecation warnings.  Set environment variable SQLALCHEMY_SILENCE_UBER_WARNING=1 to silence this message. (Background on SQLAlchemy 2.0 at: https://sqlalche.me/e/b8d9)
  res = conn.execute("SELECT 1")

The dataset_from_database function, located in pyqrlew.io.dataset, enables the creation of a Dataset object from a SQLAlchemy engine. It takes the following parameters:

  • name: the dataset’s name,

  • engine: the SQLAlchemy engine establishing a connection to the database,

  • schema_name: the optional name of the schema,

  • ranges: if set to True, we use the the minimum and maximum values of numeric and datetime columns for setting their bounds. This may violates differential privacy and should be use if these values are public. Otherwise, use False. In that case, the bounds have to be provided in the SQL query,

  • possible_values_threshold: an optional integer that serves as a threshold for identifying a column as categorical. If a column surpasses this threshold, its categories are published. Similar to the ranges parameter, exercising this option may compromise differential privacy.

from pyqrlew.io.dataset import dataset_from_database
from pyqrlew import Dialect

ds = dataset_from_database(
    name="census_table",
    engine=engine,
    schema_name="census_schema",
    ranges=True,
)

qrlew connects to the database via the SQLAlchemy engine and fetch the tables.

The table "census_schema"."census_table" is represented as a Table variant of a Relation. This specific representation encapsulates all columns, their respective types, and optionally includes information about bounds and possible values.

In the provided example, we observe the bounds specification for the integer column age and the declaration of possible values for the string column gender.

import graphviz
display(graphviz.Source(ds.census_schema.census_table.relation().dot()))
../_images/0c83f4c1b42ef9346ab1c852734053f467b8498b667769d9a1f09d250ab78d46.svg

Any SQL query can be transformed into a Relation, a qrlew object which allows easly for query manipulation such as query translation to another dialect.

import pandas as pd
from pyqrlew.utils import print_query
# write a PostgreSql query: LIMIT is not allowed in MsSql but it becomes TOP
psql_query = "SELECT * FROM census_schema.census_table LIMIT 10"
relation = ds.relation(psql_query)
# we need to pass the MsSql dialect to to_query relation method
mssql_query = relation.to_query(Dialect.MsSql)
print_query(mssql_query)
# execute
results = pd.read_sql(mssql_query, engine)
results
WITH
  map_u3eo (id, first_name, last_name, age, gender, occupation, city, state) AS (SELECT "id" AS id, "first_name" AS first_name, "last_name" AS last_name, "age" AS age, "gender" AS gender, "occupation" AS occupation, "city" AS city, "state" AS state FROM "census_schema"."census_table"),
  map_1_fm (id, first_name, last_name, age, gender, occupation, city, state) AS (SELECT TOP (10) "id" AS id, "first_name" AS first_name, "last_name" AS last_name, "age" AS age, "gender" AS gender, "occupation" AS occupation, "city" AS city, "state" AS state FROM "map_u3eo")
SELECT TOP (10) * FROM "map_1_fm"
id first_name last_name age gender occupation city state
0 1 John Doe 30 Male Engineer New York NY
1 2 Jane Doe 28 Female Engineer Los Angeles CA
2 3 Bob Smith 45 Male Analyst Chicago IL
3 4 Alice Johnson 35 Female Data Scientist San Francisco CA
4 5 Tom Brown 50 Male Lawyer Miami FL
5 6 Emily Davis 32 Female Software Developer Seattle WA
6 7 Charlie Miller 40 Male Accountant Boston MA
7 8 Eva Williams 29 Female Marketing Manager Austin TX
8 9 Mike Jones 38 Male Analyst Denver CO
9 10 Sophia Taylor 42 Female Analyst Atlanta GA

There are serveral differences between PostgreSql and MsSql dialects: LIMIT becomes TOP in the SELECT quantifier, LN which is natural log in PostgreSql becomes LOG in MsSql, MD5(col) PostgreSql function is tranlated as CONVERT(VARCHAR(MAX), HASHBYTES('MD5', col, 2)) to name a few.

psql_query = "SELECT age, LN(age) AS nat_log_age, MD5(CONCAT(first_name, last_name)) AS hashed_name FROM census_schema.census_table LIMIT 10"
relation = ds.relation(psql_query)

mssql_query = relation.to_query(Dialect.MsSql)
print_query(mssql_query)

results = pd.read_sql(mssql_query, engine)
results
WITH
  map_3l7g (age, nat_log_age, hashed_name) AS (SELECT "age" AS age, LOG("age") AS nat_log_age, CONVERT(VARCHAR(MAX),
  HASHBYTES('MD5', CONCAT("first_name", "last_name")),
  2) AS hashed_name FROM "census_schema"."census_table"),
  map_3_1r (age, nat_log_age, hashed_name) AS (SELECT TOP (10) "age" AS age, "nat_log_age" AS nat_log_age, "hashed_name" AS hashed_name FROM "map_3l7g")
SELECT TOP (10) * FROM "map_3_1r"
age nat_log_age hashed_name
0 30 3.401197 9FD9F63E0D6487537569075DA85A0C7F
1 28 3.332205 1E218214D6994742C9D543F8FBB5EE10
2 45 3.806662 103891BACA2751A856B094DB796E3FEE
3 35 3.555348 62360D09B6561E319B76DA8AE91DD526
4 50 3.912023 5306CF440C6634E13DBF38DE4DFDA04E
5 32 3.465736 D7E90AD028AFE50F5A0AEB8A7A6818A1
6 40 3.688879 83D1145CCB93A34EBBDBD410C3F9A5CB
7 29 3.367296 5436CDEC603390DE2E6C097FA7EBD065
8 38 3.637586 FC2BA1CF41ADAA8605FA0362C2E60AF1
9 42 3.737670 0E7479482648F5E1013A27599B3F9648