Cloud DLP Architecture
Implementation
from google.cloud import dlp_v2
# Module-level Cloud DLP client; the functions in this file call it as `client`.
client = dlp_v2.DlpServiceClient()
# Inspect data for PII
def inspect_data(project_id, content):
    """Inspect a text string for sensitive information with Cloud DLP.

    Args:
        project_id: Google Cloud project ID used to build the request parent.
        content: Text to scan.

    Returns:
        The findings reported by the API (``response.result.findings``);
        empty when nothing matched at the configured likelihood.
    """
    parent = f"projects/{project_id}"
    inspect_config = {
        "info_types": [
            {"name": "EMAIL_ADDRESS"},
            {"name": "PHONE_NUMBER"},
            {"name": "CREDIT_CARD_NUMBER"},
            {"name": "US_SOCIAL_SECURITY_NUMBER"},
            {"name": "PERSON_NAME"},
        ],
        "min_likelihood": "LIKELY",
        # InspectConfig has no top-level "max_findings" field; the cap on
        # returned findings is configured under "limits".
        "limits": {"max_findings_per_request": 100},
    }
    response = client.inspect_content(
        request={
            "parent": parent,
            "inspect_config": inspect_config,
            "item": {"value": content},
        }
    )
    # InspectResult exposes the matches as "findings".
    return response.result.findings
# De-identify sensitive data
def deidentify_data(project_id, content):
    """De-identify email addresses and phone numbers in a text string.

    Email addresses are character-masked with ``*`` and phone numbers are
    replaced with the literal ``[PHONE REDACTED]``.

    Args:
        project_id: Google Cloud project ID used to build the request parent.
        content: Text to de-identify.

    Returns:
        The de-identified text (``response.item.value``).
    """
    parent = f"projects/{project_id}"
    # Scope detection to the info types that have a transformation below, so
    # the request states explicitly what is being looked for.
    inspect_config = {
        "info_types": [
            {"name": "EMAIL_ADDRESS"},
            {"name": "PHONE_NUMBER"},
        ],
    }
    deidentify_config = {
        "info_type_transformations": {
            "transformations": [
                {
                    "info_types": [{"name": "EMAIL_ADDRESS"}],
                    "primitive_transformation": {
                        "character_mask_config": {
                            "masking_character": "*",
                            # NOTE(review): per the CharacterMaskConfig
                            # reference, 0 (unset) masks every character of
                            # the match - confirm this is the intent.
                            "number_to_mask": 0,
                            "reverse_order": False,
                        }
                    },
                },
                {
                    "info_types": [{"name": "PHONE_NUMBER"}],
                    "primitive_transformation": {
                        "replace_config": {
                            "new_value": {
                                "string_value": "[PHONE REDACTED]"
                            }
                        }
                    },
                },
            ]
        }
    }
    response = client.deidentify_content(
        request={
            "parent": parent,
            "deidentify_config": deidentify_config,
            "inspect_config": inspect_config,
            "item": {"value": content},
        }
    )
    # DeidentifyContentResponse carries the transformed item directly; it has
    # no "result" wrapper.
    return response.item.value
# Risk analysis
def analyze_risk(project_id, table_reference, quasi_ids=None):
    """Start a k-anonymity re-identification risk analysis job.

    Risk analysis runs as an asynchronous DLP job (``create_dlp_job`` with a
    ``risk_job``); it is not a content re-identification call.

    Args:
        project_id: Google Cloud project ID used to build the request parent.
        table_reference: Source table for the analysis. Presumably a
            BigQueryTable mapping with ``project_id``, ``dataset_id`` and
            ``table_id`` - verify against callers.
        quasi_ids: Optional iterable of column names to treat as
            quasi-identifiers. Defaults to ``age``, ``zip_code`` and
            ``gender``.

    Returns:
        The created DLP job. NOTE(review): the k-anonymity result is expected
        under the job's ``risk_details`` once the job has finished (poll with
        ``get_dlp_job``) - confirm against the API reference.

    Raises:
        ValueError: If ``quasi_ids`` is given but empty.
    """
    if quasi_ids is None:
        quasi_ids = ("age", "zip_code", "gender")
    quasi_id_fields = [{"name": column} for column in quasi_ids]
    if not quasi_id_fields:
        raise ValueError("quasi_ids must contain at least one column name")

    parent = f"projects/{project_id}"
    risk_job = {
        "privacy_metric": {
            "k_anonymity_config": {"quasi_ids": quasi_id_fields},
        },
        "source_table": table_reference,
    }
    return client.create_dlp_job(
        request={
            "parent": parent,
            "risk_job": risk_job,
        }
    )
✨
Best Practice: Use Cloud DLP to scan data lakes for PII. Implement de-identification for non-production environments. Use risk analysis to identify quasi-identifiers. Create inspection templates for consistency. Schedule regular scans for compliance.
Common Interview Questions
Q1: What are the de-identification methods in Cloud DLP?
Answer: 1) Masking (character masking), 2) Redaction or replacement (remove the value or replace it with a placeholder), 3) Tokenization (reversible when done with deterministic encryption or format-preserving encryption), 4) Bucketing (numeric ranges), 5) Crypto hashing (irreversible). Choose based on reversibility and compliance requirements.
Q2: When should you use tokenization vs. masking?
Answer: Tokenization is reversible: use it when you need to recover the original data (testing, analytics). Masking is irreversible: use it for permanent de-identification (e.g., to support GDPR compliance). In Cloud DLP, reversible tokenization relies on a cryptographic key (typically wrapped with Cloud KMS) rather than a token vault; masking is simpler.
Q3: What is k-anonymity?
Answer: K-anonymity ensures each record is indistinguishable from at least k-1 other records based on quasi-identifiers. Higher k values mean lower re-identification risk. Cloud DLP can calculate k-anonymity for datasets.
Q4: How do you scan BigQuery tables for PII?
Answer: Use Cloud DLP inspection jobs to scan BigQuery tables. Configure info types and minimum likelihood. Results can be exported to BigQuery for analysis. Schedule scans for ongoing monitoring.
Q5: What are the compliance benefits of Cloud DLP?
Answer: 1) Automated PII detection, 2) Consistent de-identification, 3) Risk analysis for quasi-identifiers, 4) Audit logging, 5) Template-based policies, 6) Integration with Dataplex for governance.