diff --git a/GDPR.md b/GDPR.md new file mode 100644 index 0000000..c1622ad --- /dev/null +++ b/GDPR.md @@ -0,0 +1,122 @@ +# GDPR compliance and data request handling + +Since the Error Tracker collects sensitive user data, a lot of care must be +taken with the production environment. + +It can happen that the Canonical legal team receives a GDPR (or equivalent) +data request to dump and/or remove user data, in which case the Error Tracker +maintainers will be contacted to provide an answer. This document helps +to understand what is being processed and how to handle the request. + + +## High level view of the data collection and processing + + +### The `whoopsie-id` identifier + +The `whoopsie-id` is a long string uniquely (hopefully) identifying every Ubuntu +installation. + +To find the `whoopsie-id` on a machine, simply look at the content of the +following file: +``` +/var/lib/whoopsie/whoopsie-id +``` + +This is the only means there is to actually identify the user data when someone +makes such a request. If the data request doesn't have such an ID, you must ask +for it, or decline the processing, as there simply is no means of making sure the +request actually comes from a legitimate user, and we want to avoid sending user +data to the wrong person. + +### Where do the user data live? + +When a user's machine sends a crash to `daisy.ubuntu.com`, there are two places +where the data can be stored: + +1. Most of the entries in the crash report will be stored in the `OOPS` table + of the Cassandra database, with the `key` column referring to the OOPS ID for + each row, allowing quick retrieval of all of them. The mapping between the + `whoopsie-id` and the newly created OOPS ID is kept in `UserOOPS`. + +2. The only entry of a crash report not stored in Cassandra is the `CoreDump` + key, which is stored temporarily in a dedicated `swift` bucket. This + coredump, when uploaded (only when needed), is only kept for the time it takes the + "retrace" to happen. 
+ The purpose of the "retrace" process is to load the coredump with additional + debugging symbols installed, so that the stacktrace newly obtained is more + useful to developers, because it would contain source code references. If + that "retrace" process already happened, or is not needed, the coredump is + never uploaded and stays on a user's machine. + +### Data retention policy + +There is currently no fully automated data removal script in the Error Tracker. +However, there are a couple of processes in place already to avoid data staying +there forever (especially user data). +1. When a release goes End of Life, all its OOPSes are removed. In practice, + that means about 9 months for an interim Ubuntu release, and 5 years for an + LTS Ubuntu release. LTS data are usually handled first by step 2 below. +2. All OOPSes are manually cleaned after a couple of years, so in practice, the + Error Tracker only stores user data for about 3 years. Aggregated data such + as counters or indexes might stay longer though. +3. The coredumps are always removed as soon as they've been processed and + are not necessary anymore. In practice, the time to process depends on + the length of the processing queue, and can go from a couple of minutes to + about 10-15 days. + +### Who has access? + +There are two kinds of access to the data: through the web UI/API, and on the infrastructure itself. +1. From the web UI, the only Launchpad users that + are allowed are members of the Launchpad group + [`~error-tracker-access`](https://launchpad.net/~error-tracker-access). + The web UI only provides access to what's stored in Cassandra, so no coredump + is ever accessible. +2. On the infrastructure, only selected Canonical employees such as SREs or + Error Tracker maintainers have access to the data. That includes coredumps in + the `swift` bucket, for the time those coredumps are waiting to be processed. 
+ + +## Handling a GDPR data request + +To handle such cases, the `src/tools/gdpr.py` script can be used, which takes a +`whoopsie-id` as its primary input. + +### Dumping data + +The `gdpr.py` script will by default dump the data without deleting them. The +dumped data will take the form of a tarball written to the current directory, +with the following name pattern: `error-tracker-<whoopsie-id>-<timestamp>.tar.gz` + +A `--no-dump` flag is provided in case dumping is not needed. + +### Deleting data + +The `gdpr.py` script will by default not remove the data, to prevent accidental +data loss. + +A `--remove` flag is provided in case deletion is needed. + +### Answering the request + +Here is an email template that can be used to answer a GDPR data request: + +``` +Hi [User's Name], + +We have received your GDPR request for a copy of your personal data and its subsequent deletion. + +1. Your data download +You can access your data dump here: [Insert Link to Data]. This link will remain active for [7 days]. + +2. Data deletion +As requested, we have initiated the permanent deletion of your data from our systems. This process can take up to 10 days to fully propagate across all storage nodes. + +You can find more information on the data processing here: https://github.com/ubuntu/error-tracker/blob/main/GDPR.md +If you have any further questions, please reply directly to this email. 
#!/usr/bin/python3
"""GDPR request handler for the Error Tracker.

Dump and/or delete all the crash reports from a specific systemid like
https://errors.ubuntu.com/user/cfc8a68e9841db904b074a1135c3e6514ac806e675445489d5ad3aa09633fe2d968c6918cb9f343f2c7a353461ab93afcbccf176af756b7426f75935afc64cb2
"""

import io
import json
import sys
import tarfile
from argparse import ArgumentParser
from datetime import datetime
from pathlib import Path

import swiftclient
from problem_report import CompressedValue, _base64_decoder

# Make the sibling packages under src/ importable when run as a script.
sys.path.insert(0, str(Path(__file__).parent.parent))

from errors import cassie
from errortracker import cassandra, config
from errortracker.cassandra_schema import OOPS, Stacktrace, UserOOPS
from errortracker.swift_utils import get_swift_client

cassandra.setup_cassandra()
swift = get_swift_client()
# Single timestamp shared by the log messages and the tarball name.
now = datetime.now()

# Columns of the Stacktrace table attached to (or removed with) each OOPS.
_STACKTRACE_COLUMNS = ("Stacktrace", "ThreadStacktrace")
# Chunk size used when streaming a core out of swift.
_SWIFT_CHUNK_SIZE = 65536


def parse_args():
    """Parse the command line and return the resulting namespace.

    The namespace carries ``no_dump`` and ``remove`` (both booleans, False
    by default) and the positional ``whoopsie_id``.
    """
    parser = ArgumentParser(description="GDPR request handler for the Error Tracker")
    parser.add_argument(
        "--no-dump",
        action="store_true",
        help="Do not generate an archive tarball of the data associated with this whoopsie ID",
    )
    parser.add_argument(
        "--remove",
        action="store_true",
        help="Delete all the data associated with this whoopsie ID",
    )
    parser.add_argument(
        "whoopsie_id",
        help="The whoopsie ID to handle data for",
    )
    return parser.parse_args()


def main():
    """Dump (unless ``--no-dump``) and then optionally remove (``--remove``)."""
    args = parse_args()

    # Dump first so that a combined dump + remove run archives the data
    # before it is deleted.
    if not args.no_dump:
        dump(args.whoopsie_id)

    if args.remove:
        remove(args.whoopsie_id)


def _is_not_found(exc):
    """Return True when the swift client exception *exc* is an HTTP 404."""
    # Rely on the structured status code rather than on the wording of the
    # exception message.
    return getattr(exc, "http_status", None) == 404


def _add_member(tar, name, payload):
    """Append the ``io.BytesIO`` *payload* to *tar* as a member called *name*."""
    member = tarfile.TarInfo(name=name)
    member.size = payload.getbuffer().nbytes
    # Without an explicit mtime the members would be dated 1970-01-01.
    member.mtime = int(now.timestamp())
    payload.seek(0)
    tar.addfile(member, payload)


def _fetch_core(oopsid):
    """Return the core stored in swift for *oopsid*, or None if there is none.

    The object body is run through ``_base64_decoder`` and
    ``CompressedValue.decode_compressed_stream`` and returned as an
    ``io.BytesIO`` positioned at its start.

    Raises:
        swiftclient.exceptions.ClientException: for any swift failure other
            than the object not being found.
    """
    compressed = io.BytesIO()
    try:
        _, body = swift.get_object(
            config.swift_bucket, oopsid, resp_chunk_size=_SWIFT_CHUNK_SIZE
        )
        for chunk in body:
            compressed.write(chunk)
    except swiftclient.exceptions.ClientException as exc:
        if _is_not_found(exc):
            return None
        raise
    compressed.seek(0)

    # The whole core is buffered because its final size must be known before
    # it can be added to the tarball.
    core = io.BytesIO()
    for block in CompressedValue.decode_compressed_stream(_base64_decoder(compressed)):
        core.write(block)
    core.seek(0)
    return core


def dump(whoopsie_id):
    """Write every crash report of *whoopsie_id* to a tarball in the current directory.

    The archive is named ``error-tracker-<whoopsie_id>-<timestamp>.tar.gz``.
    Each OOPS is stored as ``oops/<oopsid>.json``; when a core is still
    present in swift it is stored as ``cores/<oopsid>.core``.

    Raises:
        swiftclient.exceptions.ClientException: if fetching a core fails for
            a reason other than the core not existing.
    """
    print(f"Dumping data for {whoopsie_id} as of {now}")
    timestamp = now.isoformat(timespec="seconds")
    archive_name = f"error-tracker-{whoopsie_id}-{timestamp}.tar.gz"
    with tarfile.open(archive_name, "w|gz") as tar:
        for row in UserOOPS.objects.filter(key=whoopsie_id.encode()):
            oopsid = row.column1

            crash = cassie.get_crash(oopsid)
            print(f"Handling OOPS {oopsid}")

            # Handle bucket information
            if "StacktraceAddressSignature" in crash:
                sas = crash["StacktraceAddressSignature"].encode()
                for column in _STACKTRACE_COLUMNS:
                    # .first() yields None when the column is missing, which
                    # ends up as null in the JSON document.
                    crash[column] = (
                        Stacktrace.objects.filter(key=sas, column1=column)
                        .values_list("value", flat=True)
                        .first()
                    )
                print(f" Added Stacktrace and ThreadStacktrace for OOPS {oopsid}")

            # save OOPS data in tarball
            json_bytes = json.dumps(crash).encode("utf-8")
            _add_member(tar, f"oops/{oopsid}.json", io.BytesIO(json_bytes))

            # handle possible core file still around
            core = _fetch_core(oopsid)
            if core is not None:
                _add_member(tar, f"cores/{oopsid}.core", core)
                print(f" Added core for OOPS {oopsid}")


def remove(whoopsie_id):
    """Delete every crash report of *whoopsie_id* from Cassandra and swift.

    For each OOPS listed in ``UserOOPS`` this removes the matching
    ``Stacktrace`` / ``ThreadStacktrace`` rows, the core in swift (if any)
    and the ``OOPS`` rows, then finally the ``UserOOPS`` rows themselves.

    Raises:
        swiftclient.exceptions.ClientException: if deleting a core fails for
            a reason other than the core not existing.
    """
    print(f"Removing data for {whoopsie_id} as of {now}")
    for row in UserOOPS.objects.filter(key=whoopsie_id.encode()):
        oopsid = row.column1

        crash = cassie.get_crash(oopsid)
        if "StacktraceAddressSignature" in crash:
            sas = crash["StacktraceAddressSignature"].encode()
            # NOTE(review): these rows are keyed by the address signature, not
            # by user; presumably other reports with the same signature share
            # them. Confirm that deleting them here is intended.
            for column in _STACKTRACE_COLUMNS:
                Stacktrace.objects.filter(key=sas, column1=column).delete()
            print(f"Deleted Stacktrace and ThreadStacktrace for {sas}")

        try:
            swift.delete_object(config.swift_bucket, oopsid)
        except swiftclient.exceptions.ClientException as exc:
            # A missing core is expected; anything else is a real failure.
            if not _is_not_found(exc):
                raise
        else:
            print(f"Deleted core from swift {oopsid}")

        OOPS.objects.filter(key=oopsid.encode()).delete()
        print(f"Deleted OOPS {oopsid}")

    # Drop the whoopsie-id -> OOPS mapping last, so an interrupted run can be
    # restarted and still find the remaining OOPS IDs.
    UserOOPS.objects.filter(key=whoopsie_id.encode()).delete()
    print(f"Deleted UserOOPSes for {whoopsie_id}")


if __name__ == "__main__":
    main()