Simple table test#

View On GitHub Open In Colab

In this notebook, we demonstrate the application of qrlew to an existing database, showcasing its capability to compile SQL queries into differential privacy.

For the purpose of this demonstration, we focus on a single table, while more intricate use cases are explored in other notebooks.

Install the database#

Initially, we establish a PostgreSQL database.

This section may be skipped if you already have access to a Postgres database.

%%capture
# Install and start a local PostgreSQL server for the demo database.
# Inspired by https://colab.research.google.com/github/tensorflow/io/blob/master/docs/tutorials/postgresql.ipynb#scrollTo=YUj0878jPyz7
!sudo apt-get -y -qq update
# graphviz is installed here because it is used later to render qrlew Relation graphs
!sudo apt-get -y -qq install postgresql-14 graphviz
# Start postgresql server
!sudo service postgresql start
# Set the postgres password and create the demo database
!sudo -u postgres psql -U postgres -c "ALTER USER postgres PASSWORD 'pyqrlew-db'"
!sudo -u postgres psql -U postgres -c "CREATE DATABASE my_database"

We create the census_table table with its synthetic version synthetic_census_table.

Note that the synthetic tables are not mandatory and can be used when `qrlew` cannot compile the query into DP.

import psycopg2

# Connection parameters for the local PostgreSQL instance installed above.
connect_pars = {
    'host': 'localhost',
    'port': 5432,
    'dbname': 'my_database',
    'user': 'postgres',
    'password': 'pyqrlew-db',
}

# Open a connection and get a cursor to run the table-creation script with.
conn = psycopg2.connect(**connect_pars)
cursor = conn.cursor()

# SQL setup script: create a dedicated schema holding the real census table and
# a synthetic twin with identical columns, then insert sample rows into both.
# The synthetic table serves as a fallback for queries qrlew cannot compile
# into differential privacy.
sql_script = """
CREATE SCHEMA census_schema;

CREATE TABLE census_schema.census_table (
    id SERIAL PRIMARY KEY,
    first_name VARCHAR(50),
    last_name VARCHAR(50),
    age INT,
    gender VARCHAR(10),
    occupation VARCHAR(50),
    city VARCHAR(50),
    state VARCHAR(50)
);

-- Inserting sample data
INSERT INTO census_schema.census_table (first_name, last_name, age, gender, occupation, city, state)
VALUES
    ('John', 'Doe', 30, 'Male', 'Engineer', 'New York', 'NY'),
    ('Jane', 'Doe', 28, 'Female', 'Engineer', 'Los Angeles', 'CA'),
    ('Bob', 'Smith', 45, 'Male', 'Analyst', 'Chicago', 'IL'),
    ('Alice', 'Johnson', 35, 'Female', 'Data Scientist', 'San Francisco', 'CA'),
    ('Tom', 'Brown', 50, 'Male', 'Lawyer', 'Miami', 'FL'),
    ('Emily', 'Davis', 32, 'Female', 'Software Developer', 'Seattle', 'WA'),
    ('Charlie', 'Miller', 40, 'Male', 'Accountant', 'Boston', 'MA'),
    ('Eva', 'Williams', 29, 'Female', 'Marketing Manager', 'Austin', 'TX'),
    ('Mike', 'Jones', 38, 'Male', 'Analyst', 'Denver', 'CO'),
    ('Sophia', 'Taylor', 42, 'Female', 'Analyst', 'Atlanta', 'GA'),
    ('John', 'Doe', 30, 'Male', 'Engineer', 'New York', 'NY'),
    ('Jane', 'Doe', 28, 'Female', 'Engineer', 'Los Angeles', 'CA'),
    ('Bob', 'Smith', 45, 'Male', 'Analyst', 'Chicago', 'IL'),
    ('Alice', 'Johnson', 35, 'Female', 'Data Scientist', 'San Francisco', 'CA'),
    ('Tom', 'Brown', 50, 'Male', 'Lawyer', 'Miami', 'FL'),
    ('Emily', 'Davis', 32, 'Female', 'Software Developer', 'Seattle', 'WA'),
    ('Charlie', 'Miller', 40, 'Male', 'Accountant', 'Boston', 'MA'),
    ('Eva', 'Williams', 29, 'Female', 'Marketing Manager', 'Austin', 'TX'),
    ('Mike', 'Jones', 38, 'Male', 'Analyst', 'Denver', 'CO'),
    ('Sophia', 'Taylor', 42, 'Female', 'Analyst', 'Atlanta', 'GA'),
    ('Alex', 'Johnson', 33, 'Male', 'Analyst', 'Portland', 'OR'),
    ('Megan', 'White', 27, 'Female', 'Data Scientist', 'San Diego', 'CA'),
    ('Daniel', 'Clark', 36, 'Male', 'Analyst', 'Houston', 'TX'),
    ('Olivia', 'Martinez', 31, 'Female', 'Data Scientist', 'Phoenix', 'AZ'),
    ('William', 'Lee', 48, 'Male', 'Professor', 'Philadelphia', 'PA'),
    ('Emma', 'Garcia', 29, 'Female', 'Data Scientist', 'Minneapolis', 'MN'),
    ('Ryan', 'Harris', 41, 'Male', 'Data Scientist', 'Dallas', 'TX'),
    ('Ava', 'Taylor', 26, 'Female', 'Product Manager', 'San Jose', 'CA'),
    ('Christopher', 'Anderson', 37, 'Male', 'Engineer', 'Detroit', 'MI'),
    ('Sophie', 'Wilson', 34, 'Female', 'HR Specialist', 'Raleigh', 'NC'),
    ('David', 'Miller', 29, 'Male', 'Engineer', 'New York', 'NY'),
    ('Sophia', 'Taylor', 34, 'Female', 'Engineer', 'Los Angeles', 'CA'),
    ('Bob', 'Smith', 42, 'Male', 'Analyst', 'Chicago', 'IL'),
    ('Alice', 'Johnson', 39, 'Female', 'Data Scientist', 'San Francisco', 'CA'),
    ('Tom', 'Brown', 37, 'Male', 'Lawyer', 'Miami', 'FL'),
    ('Emily', 'Davis', 28, 'Female', 'Software Developer', 'Seattle', 'WA'),
    ('Charlie', 'Miller', 45, 'Male', 'Accountant', 'Boston', 'MA'),
    ('Eva', 'Williams', 31, 'Female', 'Marketing Manager', 'Austin', 'TX'),
    ('Mike', 'Jones', 36, 'Male', 'Analyst', 'Denver', 'CO'),
    ('Sophia', 'Taylor', 49, 'Female', 'Analyst', 'Atlanta', 'GA'),
    ('Alex', 'Johnson', 37, 'Male', 'Analyst', 'Portland', 'OR'),
    ('Megan', 'White', 27, 'Female', 'Data Scientist', 'San Diego', 'CA'),
    ('Daniel', 'Clark', 41, 'Male', 'Analyst', 'Houston', 'TX'),
    ('Olivia', 'Martinez', 32, 'Female', 'Data Scientist', 'Phoenix', 'AZ'),
    ('William', 'Lee', 47, 'Male', 'Professor', 'Philadelphia', 'PA'),
    ('Emma', 'Garcia', 30, 'Female', 'Data Scientist', 'Minneapolis', 'MN'),
    ('Ryan', 'Harris', 40, 'Male', 'Entrepreneur', 'Dallas', 'TX'),
    ('Ava', 'Taylor', 26, 'Female', 'Product Manager', 'San Jose', 'CA'),
    ('Christopher', 'Anderson', 35, 'Male', 'Engineer', 'Detroit', 'MI'),
    ('Sophie', 'Wilson', 33, 'Female', 'HR Specialist', 'Raleigh', 'NC'),
    ('Grace', 'Brown', 55, 'Female', 'Analyst', 'New York', 'NY'),
    ('Brian', 'Taylor', 50, 'Male', 'Professor', 'Los Angeles', 'CA'),
    ('Lily', 'Harrison', 42, 'Female', 'Engineer', 'Chicago', 'IL'),
    ('Kevin', 'Garcia', 55, 'Male', 'Analyst', 'San Francisco', 'CA'),
    ('Sophie', 'Smithson', 48, 'Female', 'Lawyer', 'Miami', 'FL'),
    ('John', 'Clarkston', 38, 'Male', 'Software Developer', 'Seattle', 'WA'),
    ('Emma', 'Millerson', 40, 'Female', 'Accountant', 'Boston', 'MA'),
    ('David', 'Williamson', 45, 'Male', 'Marketing Manager', 'Austin', 'TX'),
    ('Ava', 'Johnsonson', 38, 'Female', 'Analyst', 'Denver', 'CO'),
    ('Daniel', 'Anderson', 42, 'Male', 'Data Scientist', 'Atlanta', 'GA'),
    ('Michael', 'Martinson', 33, 'Male', 'Engineer', 'Portland', 'OR'),
    ('Sophia', 'Whiterson', 27, 'Female', 'Data Scientist', 'San Diego', 'CA'),
    ('Robert', 'Davidson', 41, 'Male', 'Analyst', 'Houston', 'TX'),
    ('Grace', 'Wilson', 45, 'Female', 'Data Scientist', 'Phoenix', 'AZ'),
    ('Andrew', 'Leeson', 48, 'Male', 'Professor', 'Philadelphia', 'PA'),
    ('Emily', 'Garcia', 39, 'Female', 'Data Scientist', 'Minneapolis', 'MN'),
    ('Christopher', 'Sanders', 41, 'Male', 'Entrepreneur', 'Dallas', 'TX'),
    ('Olivia', 'Martinez', 36, 'Female', 'Product Manager', 'San Jose', 'CA'),
    ('Sophie', 'Brownson', 43, 'Female', 'Engineer', 'Detroit', 'MI'),
    ('Matthew', 'Taylors', 46, 'Male', 'HR Specialist', 'Raleigh', 'NC');

CREATE TABLE census_schema.synthetic_census_table (
    id SERIAL PRIMARY KEY,
    first_name VARCHAR(50),
    last_name VARCHAR(50),
    age INT,
    gender VARCHAR(10),
    occupation VARCHAR(50),
    city VARCHAR(50),
    state VARCHAR(50)
);

-- Inserting sample data
INSERT INTO census_schema.synthetic_census_table (first_name, last_name, age, gender, occupation, city, state)
VALUES
    ('Adam', 'Johnson', 31, 'Male', 'Engineer', 'New York', 'NY'),
    ('Sophie', 'Clark', 29, 'Female', 'Analyst', 'Los Angeles', 'CA'),
    ('David', 'Smith', 46, 'Male', 'Data Scientist', 'Chicago', 'IL'),
    ('Emma', 'Brown', 36, 'Female', 'Software Developer', 'San Francisco', 'CA'),
    ('James', 'Taylor', 51, 'Male', 'Lawyer', 'Miami', 'FL'),
    ('Olivia', 'Miller', 33, 'Female', 'Accountant', 'Seattle', 'WA'),
    ('Lucas', 'Williams', 41, 'Male', 'Marketing Manager', 'Boston', 'MA'),
    ('Ava', 'Davis', 30, 'Female', 'Analyst', 'Austin', 'TX'),
    ('Adam', 'Jones', 39, 'Male', 'Engineer', 'Denver', 'CO'),
    ('Sophie', 'Johnson', 43, 'Female', 'Analyst', 'Atlanta', 'GA'),
    ('Alex', 'Smith', 34, 'Male', 'Software Developer', 'Portland', 'OR'),
    ('Mia', 'White', 28, 'Female', 'Data Scientist', 'San Diego', 'CA'),
    ('Daniel', 'Clarkson', 37, 'Male', 'Lawyer', 'Houston', 'TX'),
    ('Oliver', 'Martinez', 32, 'Male', 'Data Scientist', 'Phoenix', 'AZ'),
    ('Sophia', 'Lee', 49, 'Female', 'Professor', 'Philadelphia', 'PA'),
    ('Lily', 'Garcia', 30, 'Female', 'Data Scientist', 'Minneapolis', 'MN'),
    ('Matthew', 'Harris', 42, 'Male', 'Entrepreneur', 'Dallas', 'TX'),
    ('Emily', 'Taylor', 27, 'Female', 'Product Manager', 'San Jose', 'CA'),
    ('William', 'Anderson', 36, 'Male', 'Engineer', 'Detroit', 'MI'),
    ('Sophia', 'Wilson', 34, 'Female', 'HR Specialist', 'Raleigh', 'NC'),
    ('Ethan', 'Miller', 29, 'Male', 'Engineer', 'New York', 'NY'),
    ('Mia', 'Clark', 35, 'Female', 'Software Developer', 'Los Angeles', 'CA'),
    ('Jack', 'Smithson', 43, 'Male', 'Analyst', 'Chicago', 'IL'),
    ('Ava', 'Johnsonson', 38, 'Female', 'Data Scientist', 'San Francisco', 'CA'),
    ('Henry', 'Brown', 36, 'Male', 'Lawyer', 'Miami', 'FL'),
    ('Sophie', 'Davis', 27, 'Female', 'Accountant', 'Seattle', 'WA'),
    ('Noah', 'Williamson', 45, 'Male', 'Marketing Manager', 'Boston', 'MA'),
    ('Mia', 'Williams', 29, 'Female', 'Analyst', 'Austin', 'TX'),
    ('Ethan', 'Jones', 38, 'Male', 'Engineer', 'Denver', 'CO'),
    ('Ava', 'Taylor', 42, 'Female', 'Analyst', 'Atlanta', 'GA'),
    ('Alex', 'Johnson', 33, 'Male', 'Software Developer', 'Portland', 'OR'),
    ('Mia', 'White', 27, 'Female', 'Data Scientist', 'San Diego', 'CA'),
    ('Daniel', 'Clark', 41, 'Male', 'Lawyer', 'Houston', 'TX'),
    ('Olivia', 'Martinez', 32, 'Female', 'Data Scientist', 'Phoenix', 'AZ'),
    ('William', 'Lee', 47, 'Male', 'Professor', 'Philadelphia', 'PA'),
    ('Emma', 'Garcia', 30, 'Female', 'Data Scientist', 'Minneapolis', 'MN'),
    ('Ryan', 'Harris', 40, 'Male', 'Entrepreneur', 'Dallas', 'TX'),
    ('Ava', 'Taylor', 26, 'Female', 'Product Manager', 'San Jose', 'CA'),
    ('Christopher', 'Anderson', 35, 'Male', 'Engineer', 'Detroit', 'MI'),
    ('Sophie', 'Wilson', 33, 'Female', 'HR Specialist', 'Raleigh', 'NC'),
    ('Grace', 'Brown', 55, 'Female', 'Analyst', 'New York', 'NY'),
    ('Brian', 'Taylor', 50, 'Male', 'Professor', 'Los Angeles', 'CA'),
    ('Lily', 'Harrison', 42, 'Female', 'Engineer', 'Chicago', 'IL'),
    ('Kevin', 'Garcia', 55, 'Male', 'Analyst', 'San Francisco', 'CA'),
    ('Sophie', 'Smithson', 48, 'Female', 'Lawyer', 'Miami', 'FL'),
    ('John', 'Clarkston', 38, 'Male', 'Software Developer', 'Seattle', 'WA'),
    ('Emma', 'Millerson', 40, 'Female', 'Accountant', 'Boston', 'MA'),
    ('David', 'Williamson', 45, 'Male', 'Marketing Manager', 'Austin', 'TX'),
    ('Ava', 'Johnsonson', 38, 'Female', 'Analyst', 'Denver', 'CO'),
    ('Daniel', 'Anderson', 42, 'Male', 'Data Scientist', 'Atlanta', 'GA'),
    ('Michael', 'Martinson', 33, 'Male', 'Engineer', 'Portland', 'OR'),
    ('Sophia', 'Whiterson', 27, 'Female', 'Data Scientist', 'San Diego', 'CA'),
    ('Robert', 'Davidson', 41, 'Male', 'Analyst', 'Houston', 'TX'),
    ('Grace', 'Wilson', 45, 'Female', 'Data Scientist', 'Phoenix', 'AZ'),
    ('Andrew', 'Leeson', 48, 'Male', 'Professor', 'Philadelphia', 'PA'),
    ('Emily', 'Garcia', 39, 'Female', 'Data Scientist', 'Minneapolis', 'MN'),
    ('Christopher', 'Sanders', 41, 'Male', 'Entrepreneur', 'Dallas', 'TX'),
    ('Olivia', 'Martinez', 36, 'Female', 'Product Manager', 'San Jose', 'CA'),
    ('Sophie', 'Brownson', 43, 'Female', 'Engineer', 'Detroit', 'MI'),
    ('Matthew', 'Taylors', 46, 'Male', 'HR Specialist', 'Raleigh', 'NC')
"""
# Execute the whole script in one round-trip and commit; release the cursor
# and connection even if execution fails part-way (the original leaked both
# on error).
try:
    cursor.execute(sql_script)
    conn.commit()
finally:
    cursor.close()
    conn.close()

Use qrlew on the database#

At this stage, we have a postgres dataset that can be connected with qrlew.

%%capture
# Install the qrlew Python bindings plus plotting/graph-rendering helpers.
!pip install -U pyqrlew matplotlib graphviz

The dataset_from_database function, located in pyqrlew.io.dataset, enables the creation of a Dataset object from a SQLAlchemy engine. It takes the following parameters:

  • name: the dataset’s name,

  • engine: the SQLAlchemy engine establishing a connection to the database,

  • schema_name: the optional name of the schema,

  • ranges: if set to True, we use the minimum and maximum values of numeric and datetime columns for setting their bounds. This may violate differential privacy and should be used only if these values are public. Otherwise, use False. In that case, the bounds have to be provided in the SQL query,

  • possible_values_threshold: an optional integer that serves as a threshold for identifying a column as categorical. If a column surpasses this threshold, its categories are published. Similar to the ranges parameter, exercising this option may compromise differential privacy.

from pyqrlew.io.dataset import dataset_from_database
from sqlalchemy import create_engine

# Same connection parameters as in the database-setup cell above.
connect_pars= {
    'dbname':'my_database',
    'user':'postgres',
    'port':5432,
    'password':'pyqrlew-db',
    'host':'localhost',
}

# Assemble the SQLAlchemy URL from named pieces for readability.
credentials = f'{connect_pars["user"]}:{connect_pars["password"]}'
location = f'{connect_pars["host"]}:{connect_pars["port"]}/{connect_pars["dbname"]}'
engine = create_engine(f'postgresql+psycopg2://{credentials}@{location}')

# Build the qrlew Dataset; publishing ranges and small categorical domains
# may leak information, so only do this when those values are public.
ds = dataset_from_database(
    name="census",
    engine=engine,
    schema_name="census_schema",
    ranges=True,
    possible_values_threshold=5,
)

qrlew connects to the database via the SQLAlchemy engine and fetches the tables.

The table "census_schema"."census_table" is represented as a Table variant of a Relation. This specific representation encapsulates all columns, their respective types, and optionally includes information about bounds and possible values.

In the provided example, we observe the bounds specification for the integer column age and the declaration of possible values for the string column gender.

import graphviz
# Render the Table Relation of "census_schema"."census_table" as a graph.
display(graphviz.Source(ds.census_schema.census_table.relation().dot()))
../_images/4fd81fb9740f8cef28cf11958ffda46a6d1e2bb30b7bac15413b4b44130467f8.svg

Any SQL query can be transformed into a Relation, a qrlew object that makes compilation into differential privacy easier.

import pandas as pd
# A standard aggregation query over the census table.
query = "SELECT sum(age) FROM census_schema.census_table GROUP BY gender"
# Parse the SQL into a qrlew Relation built against the dataset's schema.
relation = ds.relation(query)
display(graphviz.Source(relation.dot()))
../_images/23f188cb7f880e53ef638100a585e5d637eb8cac8f9ee1b8976a37a8e3e1d849.svg
# A count per gender restricted to age = 10.
query = "SELECT gender as gender, COUNT(*)  AS count_all FROM census_schema.census_table WHERE age = 10 GROUP BY gender"
relation = ds.relation(query)
display(graphviz.Source(relation.dot()))
../_images/5727f3a3bfdfd6bd0a4b251a4e28e1a795d63380c1d4baa24a71487216fc662d.svg

Let’s convert this Relation into differential privacy with the rewrite_with_differential_privacy method that outputs:

  • privacy_unit: the privacy unit may be defined as a column within the current table or even another table. Alternatively, each row can be protected.

  • synthetic_data: this optional list associates each original table to its synthetic version. If these tables are provided, any queries that can’t be compiled into DP will be executed on the synthetic data

  • epsilon_delta: a dictionary with the \(\varepsilon\) and \(\delta\) privacy parameters.

# Protect each row of census_table (no explicit privacy-unit column).
privacy_unit = [
    ("census_table", [], "_PRIVACY_UNIT_ROW_"),
]
# No synthetic fallback configured for this first rewrite.
synthetic_data = None

# Compile the Relation into a differentially private equivalent.
dp_relation = relation.rewrite_with_differential_privacy(
    dataset=ds,
    privacy_unit=privacy_unit,
    synthetic_data=synthetic_data,
    epsilon_delta={"epsilon": 1.0, "delta": 1e-3},
)

The result of rewrite_with_differential_privacy is an object that holds both the differentially private Relation and a PrivateQuery containing the invoked mechanisms during the compilation.

# Display the computation graph of the DP-rewritten Relation.
display(graphviz.Source(dp_relation.relation().dot()))
../_images/a9e0c847e7ae0e5525ac5ffc26d0179e2d1c020b12d718152c72663cfab3d1e2.svg

In this example, the only invoked mechanism is a Gaussian mechanism for adding noise to the count.

# Inspect the DP accounting event describing the mechanisms that were invoked.
print(dp_relation.dp_event().to_dict())
{'module_name': 'dp_accounting.dp_event', 'class_name': 'GaussianDpEvent', 'noise_multiplier': 3.776479532659047, '_fields': ['module_name', 'class_name', 'noise_multiplier']}

Any Relation can be transformed into an SQL query:

# Serialize the DP Relation back into an executable SQL string.
dp_sql_query = dp_relation.relation().to_query()
dp_sql_query
"WITH field_d4_o (field_d4_o) AS (SELECT * FROM (VALUES ('Female'), ('Male')) AS field_d4_o (field_d4_o)), map_xt3g (_PRIVACY_UNIT_ROW_, id, first_name, last_name, age, gender, occupation, city, state) AS (SELECT RANDOM() AS _PRIVACY_UNIT_ROW_, id AS id, first_name AS first_name, last_name AS last_name, age AS age, gender AS gender, occupation AS occupation, city AS city, state AS state FROM census_schema.census_table), map_lg9f (_PRIVACY_UNIT_, _PRIVACY_UNIT_WEIGHT_, _PRIVACY_UNIT_ROW_, id, first_name, last_name, age, gender, occupation, city, state) AS (SELECT MD5(CAST(_PRIVACY_UNIT_ROW_ AS TEXT)) AS _PRIVACY_UNIT_, 1 AS _PRIVACY_UNIT_WEIGHT_, _PRIVACY_UNIT_ROW_ AS _PRIVACY_UNIT_ROW_, id AS id, first_name AS first_name, last_name AS last_name, age AS age, gender AS gender, occupation AS occupation, city AS city, state AS state FROM map_xt3g), map_w7kv (_PRIVACY_UNIT_, _PRIVACY_UNIT_WEIGHT_, field_n9r3, field_d4_o) AS (SELECT _PRIVACY_UNIT_ AS _PRIVACY_UNIT_, _PRIVACY_UNIT_WEIGHT_ AS _PRIVACY_UNIT_WEIGHT_, 1 AS field_n9r3, gender AS field_d4_o FROM map_lg9f WHERE (age) = (10)), join_fd1m (field_d4_o, _PRIVACY_UNIT_, _PRIVACY_UNIT_WEIGHT_, field_n9r3, left__kq_o) AS (SELECT * FROM field_d4_o AS _LEFT_ LEFT JOIN map_w7kv AS _RIGHT_ ON (_LEFT_.field_d4_o) = (_RIGHT_.field_d4_o)), map_lpgc (field_d4_o, _PRIVACY_UNIT_, _PRIVACY_UNIT_WEIGHT_, field_n9r3) AS (SELECT field_d4_o AS field_d4_o, _PRIVACY_UNIT_ AS _PRIVACY_UNIT_, _PRIVACY_UNIT_WEIGHT_ AS _PRIVACY_UNIT_WEIGHT_, field_n9r3 AS field_n9r3 FROM join_fd1m), map_66hx (_PRIVACY_UNIT_, _PRIVACY_UNIT_WEIGHT_, field_d4_o, _ONE_field_n9r3) AS (SELECT COALESCE(CAST(_PRIVACY_UNIT_ AS TEXT), '_PRIVACY_UNIT_DEFAULT_') AS _PRIVACY_UNIT_, COALESCE(_PRIVACY_UNIT_WEIGHT_, 0) AS _PRIVACY_UNIT_WEIGHT_, field_d4_o AS field_d4_o, CASE WHEN field_n9r3 IS NULL THEN 0 ELSE 1 END AS _ONE_field_n9r3 FROM map_lpgc), map_2pcl (field_1iro, field_grep, field_zyse) AS (SELECT _ONE_field_n9r3 AS field_1iro, _PRIVACY_UNIT_ AS field_grep, 
field_d4_o AS field_zyse FROM map_66hx), reduce_3sqd (_PRIVACY_UNIT_, field_d4_o, _NORM__ONE_field_n9r3) AS (SELECT field_grep AS _PRIVACY_UNIT_, field_zyse AS field_d4_o, SUM(field_1iro) AS _NORM__ONE_field_n9r3 FROM map_2pcl GROUP BY field_grep, field_zyse), map_2n4o (_PRIVACY_UNIT_, field_d4_o, _NORM__ONE_field_n9r3) AS (SELECT _PRIVACY_UNIT_ AS _PRIVACY_UNIT_, field_d4_o AS field_d4_o, ABS((_NORM__ONE_field_n9r3) * (_NORM__ONE_field_n9r3)) AS _NORM__ONE_field_n9r3 FROM reduce_3sqd), map_hfzq (field_7ra5, field_grep) AS (SELECT _NORM__ONE_field_n9r3 AS field_7ra5, _PRIVACY_UNIT_ AS field_grep FROM map_2n4o), reduce_smed (_PRIVACY_UNIT_, _ONE_field_n9r3) AS (SELECT field_grep AS _PRIVACY_UNIT_, SUM(field_7ra5) AS _ONE_field_n9r3 FROM map_hfzq GROUP BY field_grep), map_fro8 (_PRIVACY_UNIT_, _ONE_field_n9r3) AS (SELECT _PRIVACY_UNIT_ AS _PRIVACY_UNIT_, (1) / (GREATEST(1, (SQRT(_ONE_field_n9r3)) / (1))) AS _ONE_field_n9r3 FROM reduce_smed), join_0gi9 (_PRIVACY_UNIT_, _PRIVACY_UNIT_WEIGHT_, field_d4_o, _ONE_field_n9r3, _SCALE_FACTOR__PRIVACY_UNIT_, _SCALE_FACTOR__ONE_field_n9r3) AS (SELECT * FROM map_66hx AS _LEFT_ JOIN map_fro8 AS _RIGHT_ ON (_LEFT_._PRIVACY_UNIT_) = (_RIGHT_._PRIVACY_UNIT_)), map_m3cm (_PRIVACY_UNIT_, _PRIVACY_UNIT_WEIGHT_, field_d4_o, _ONE_field_n9r3, _SCALE_FACTOR__PRIVACY_UNIT_, _SCALE_FACTOR__ONE_field_n9r3, _CLIPPED__ONE_field_n9r3) AS (SELECT _PRIVACY_UNIT_ AS _PRIVACY_UNIT_, _PRIVACY_UNIT_WEIGHT_ AS _PRIVACY_UNIT_WEIGHT_, field_d4_o AS field_d4_o, _ONE_field_n9r3 AS _ONE_field_n9r3, _SCALE_FACTOR__PRIVACY_UNIT_ AS _SCALE_FACTOR__PRIVACY_UNIT_, _SCALE_FACTOR__ONE_field_n9r3 AS _SCALE_FACTOR__ONE_field_n9r3, (_ONE_field_n9r3) * (_SCALE_FACTOR__ONE_field_n9r3) AS _CLIPPED__ONE_field_n9r3 FROM join_0gi9), map_orr7 (field_8d51, field_zyse) AS (SELECT _CLIPPED__ONE_field_n9r3 AS field_8d51, field_d4_o AS field_zyse FROM map_m3cm), reduce_tdcw (field_d4_o, _COUNT_field_n9r3) AS (SELECT field_zyse AS field_d4_o, SUM(field_8d51) AS _COUNT_field_n9r3 
FROM map_orr7 GROUP BY field_zyse), map_m6is (field_d4_o, _COUNT_field_n9r3) AS (SELECT field_d4_o AS field_d4_o, LEAST(140, GREATEST(0, (COALESCE(_COUNT_field_n9r3, 0)) + ((3.776479532659047) * ((SQRT((-2) * (LN(RANDOM())))) * (COS((6.283185307179586) * (RANDOM()))))))) AS _COUNT_field_n9r3 FROM reduce_tdcw), map_itya (field_i4xy, field_d4_o) AS (SELECT CAST(_COUNT_field_n9r3 AS INTEGER) AS field_i4xy, field_d4_o AS field_d4_o FROM map_m6is), map_09t5 (field_d4_o, field_i4xy) AS (SELECT field_d4_o AS field_d4_o, field_i4xy AS field_i4xy FROM map_itya), map_vdvc (gender, count_all) AS (SELECT field_d4_o AS gender, field_i4xy AS count_all FROM map_09t5) SELECT * FROM map_vdvc"

The DP query can then be sent to the database:

# Execute the DP query against the real database.
dp_results = pd.read_sql(dp_sql_query, engine)
dp_results
gender count_all
0 Female 1
1 Male 3

This can be compared to the result of the original query:

# Run the original (non-private) query for comparison with the DP result.
exact_results = pd.read_sql(query, engine)
exact_results
gender count_all

Queries with GROUP BY clause#

If the grouping keys are not public, we utilize the \(\tau\)-thresholding mechanism, which filters out keys with noisy counts below a specific threshold determined by the privacy parameters.

# Aggregate per occupation. The grouping keys are private here, so the
# tau-thresholding mechanism decides which keys may be released.
query = """
    SELECT occupation as occupation, COUNT(*) AS count_occ, AVG(age) AS avg_age
    FROM census_schema.census_table
    GROUP BY occupation
"""
relation = ds.relation(query)
privacy_unit = [("census_table", [], "_PRIVACY_UNIT_ROW_")]  # row-level protection
synthetic_data = None

dp_relation = relation.rewrite_with_differential_privacy(
    dataset=ds,
    privacy_unit=privacy_unit,
    synthetic_data=synthetic_data,
    epsilon_delta={"epsilon": 5.0, "delta": 5e-4},
)
print(dp_relation.dp_event().to_dict())
{'module_name': 'dp_accounting.dp_event', 'class_name': 'ComposedDpEvent', 'events': [{'module_name': 'dp_accounting.dp_event', 'class_name': 'EpsilonDeltaDpEvent', 'epsilon': 2.5, 'delta': 0.00025, '_fields': ['module_name', 'class_name', 'epsilon', 'delta']}, {'module_name': 'dp_accounting.dp_event', 'class_name': 'GaussianDpEvent', 'noise_multiplier': 5.2624632808831, '_fields': ['module_name', 'class_name', 'noise_multiplier']}, {'module_name': 'dp_accounting.dp_event', 'class_name': 'GaussianDpEvent', 'noise_multiplier': 5.2624632808831, '_fields': ['module_name', 'class_name', 'noise_multiplier']}, {'module_name': 'dp_accounting.dp_event', 'class_name': 'GaussianDpEvent', 'noise_multiplier': 289.4354804485705, '_fields': ['module_name', 'class_name', 'noise_multiplier']}], '_fields': ['module_name', 'class_name', 'events']}

In that case, 3 Gaussian mechanisms have been used for adding noise to the aggregations (1 for the count, 2 for the average) and an \((\varepsilon, \delta)\) mechanism for the \(\tau\)-thresholding.

If you execute the next cell multiple times, you will notice that the grouping keys may vary due to the randomness introduced by the \(\tau\)-thresholding mechanism.

# Re-running this cell can yield different grouping keys: tau-thresholding is randomized.
dp_results = pd.read_sql(dp_relation.relation().to_query(), engine)
dp_results
occupation count_occ avg_age
0 Analyst 17 74.085976
1 Engineer 21 22.989924
2 Data Scientist 19 31.201097

We can also include the grouping keys in the query’s WHERE clause. In this case, the grouping keys are treated as public, and they all appear in the result and the \(\tau\)-thresholding mechanism is not used.

# The grouping keys now appear in the WHERE clause, so they are treated as
# public: every listed occupation shows up in the result and no
# tau-thresholding mechanism is needed.
query = """
    SELECT occupation as occupation, COUNT(*) AS count_occ, AVG(age) AS avg_age
    FROM census_schema.census_table
    WHERE occupation in ('Analyst', 'Professor')
    GROUP BY occupation
"""
relation = ds.relation(query)
privacy_unit = [("census_table", [], "_PRIVACY_UNIT_ROW_")]  # row-level protection
synthetic_data = None

dp_relation = relation.rewrite_with_differential_privacy(
    dataset=ds,
    privacy_unit=privacy_unit,
    synthetic_data=synthetic_data,
    epsilon_delta={"epsilon": 5.0, "delta": 5e-4},
)
print(dp_relation.dp_event().to_dict())
{'module_name': 'dp_accounting.dp_event', 'class_name': 'ComposedDpEvent', 'events': [{'module_name': 'dp_accounting.dp_event', 'class_name': 'GaussianDpEvent', 'noise_multiplier': 2.5346230440950324, '_fields': ['module_name', 'class_name', 'noise_multiplier']}, {'module_name': 'dp_accounting.dp_event', 'class_name': 'GaussianDpEvent', 'noise_multiplier': 2.5346230440950324, '_fields': ['module_name', 'class_name', 'noise_multiplier']}, {'module_name': 'dp_accounting.dp_event', 'class_name': 'GaussianDpEvent', 'noise_multiplier': 139.40426742522678, '_fields': ['module_name', 'class_name', 'noise_multiplier']}], '_fields': ['module_name', 'class_name', 'events']}
# Execute the DP query; both listed occupations appear since the keys are public.
dp_results = pd.read_sql(dp_relation.relation().to_query(), engine)
dp_results
occupation count_occ avg_age
0 Professor 0 38.499150
1 Analyst 18 36.381657

Not DP-compilable queries#

If the query cannot be DP-compiled by qrlew and no synthetic data have been provided, then qrlew returns an error.

# MAX is not DP-compilable by qrlew; without synthetic data configured,
# the rewrite raises instead of returning a Relation.
query = """
    SELECT occupation as occupation, MAX(age) AS max_age
    FROM census_schema.census_table
    GROUP BY occupation
"""
relation = ds.relation(query)
privacy_unit = [("census_table", [], "_PRIVACY_UNIT_ROW_")]
synthetic_data = None
try:
    rewritten = relation.rewrite_with_differential_privacy(
        dataset=ds,
        privacy_unit=privacy_unit,
        synthetic_data=synthetic_data,
        epsilon_delta={"epsilon": 5.0, "delta": 5e-4},
    )
    dp_relation = rewritten.relation()
    dp_results = pd.read_sql(dp_relation.to_query(), engine)
    dp_results
except RuntimeError as e:
    print(f"The error is: {e}")
The error is: UnreachableProperty: differential_privacy is unreachable

If the user provides the list of the original and synthetic tables, then the query is sent to the synthetic tables and no budget has been spent.

# Same MAX query, but a synthetic table is now registered as a fallback:
# the query is answered on the synthetic data instead of raising.
query = """
    SELECT occupation as occupation, MAX(age) AS max_age
    FROM census_schema.census_table
    GROUP BY occupation
"""
relation = ds.relation(query)
privacy_unit = [("census_table", [], "_PRIVACY_UNIT_ROW_")]
# Map the original table to its synthetic counterpart.
synthetic_data = [
    (["census_schema", "census_table"], ["census_schema", "synthetic_census_table"]),
]

dp_relation = relation.rewrite_with_differential_privacy(
    dataset=ds,
    privacy_unit=privacy_unit,
    synthetic_data=synthetic_data,
    epsilon_delta={"epsilon": 5.0, "delta": 5e-4},
)
print(dp_relation.dp_event().to_dict())
{'module_name': 'dp_accounting.dp_event', 'class_name': 'NoOpDpEvent', '_fields': ['module_name', 'class_name']}