Managing Datasets¶
A dataset represents a collection of tables, and applies several default policies to tables as they are created:
An access control list (ACL). When created, a dataset has an ACL which maps to the ACL inherited from its project.
A default table expiration period. If set, tables created within the dataset will use that value as their expiration period.
See BigQuery documentation for more information on Datasets.
Listing Datasets¶
List datasets for a project with the
list_datasets()
method:
from google.cloud import bigquery
# Construct a BigQuery client object.
client = bigquery.Client()
datasets = list(client.list_datasets()) # Make an API request.
project = client.project
if datasets:
print("Datasets in project {}:".format(project))
for dataset in datasets:
print("\t{}".format(dataset.dataset_id))
else:
print("{} project does not contain any datasets.".format(project))
List datasets by label for a project with the
list_datasets()
method:
from google.cloud import bigquery
# Construct a BigQuery client object.
client = bigquery.Client()
label_filter = "labels.color:green"
datasets = list(client.list_datasets(filter=label_filter)) # Make an API request.
if datasets:
print("Datasets filtered by {}:".format(label_filter))
for dataset in datasets:
print("\t{}.{}".format(dataset.project, dataset.dataset_id))
else:
print("No datasets found with this filter.")
Getting a Dataset¶
Get a dataset resource (to pick up changes made by another client) with the
get_dataset()
method:
from google.cloud import bigquery
# Construct a BigQuery client object.
client = bigquery.Client()
# TODO(developer): Set dataset_id to the ID of the dataset to fetch.
# dataset_id = 'your-project.your_dataset'
dataset = client.get_dataset(dataset_id) # Make an API request.
full_dataset_id = "{}.{}".format(dataset.project, dataset.dataset_id)
friendly_name = dataset.friendly_name
print(
"Got dataset '{}' with friendly_name '{}'.".format(
full_dataset_id, friendly_name
)
)
# View dataset properties.
print("Description: {}".format(dataset.description))
print("Labels:")
labels = dataset.labels
if labels:
for label, value in labels.items():
print("\t{}: {}".format(label, value))
else:
print("\tDataset has no labels defined.")
# View tables in dataset.
print("Tables:")
tables = list(client.list_tables(dataset)) # Make an API request(s).
if tables:
for table in tables:
print("\t{}".format(table.table_id))
else:
print("\tThis dataset does not contain any tables.")
Determine if a dataset exists with the
get_dataset()
method:
from google.cloud import bigquery
from google.cloud.exceptions import NotFound
client = bigquery.Client()
# TODO(developer): Set dataset_id to the ID of the dataset whose existence you want to check.
# dataset_id = "your-project.your_dataset"
try:
client.get_dataset(dataset_id) # Make an API request.
print("Dataset {} already exists".format(dataset_id))
except NotFound:
print("Dataset {} is not found".format(dataset_id))
Creating a Dataset¶
Create a new dataset with the
create_dataset()
method:
from google.cloud import bigquery
# Construct a BigQuery client object.
client = bigquery.Client()
# TODO(developer): Set dataset_id to the ID of the dataset to create.
# dataset_id = "{}.your_dataset".format(client.project)
# Construct a full Dataset object to send to the API.
dataset = bigquery.Dataset(dataset_id)
# TODO(developer): Specify the geographic location where the dataset should reside.
dataset.location = "US"
# Send the dataset to the API for creation, with an explicit timeout.
# Raises google.api_core.exceptions.Conflict if the Dataset already
# exists within the project.
dataset = client.create_dataset(dataset, timeout=30) # Make an API request.
print("Created dataset {}.{}".format(client.project, dataset.dataset_id))
Updating a Dataset¶
Update a property in a dataset’s metadata with the
update_dataset()
method:
from google.cloud import bigquery
# Construct a BigQuery client object.
client = bigquery.Client()
# TODO(developer): Set dataset_id to the ID of the dataset to fetch.
# dataset_id = 'your-project.your_dataset'
dataset = client.get_dataset(dataset_id) # Make an API request.
dataset.description = "Updated description."
dataset = client.update_dataset(dataset, ["description"]) # Make an API request.
full_dataset_id = "{}.{}".format(dataset.project, dataset.dataset_id)
print(
"Updated dataset '{}' with description '{}'.".format(
full_dataset_id, dataset.description
)
)
Modify user permissions on a dataset with the
update_dataset()
method:
from google.cloud import bigquery
# Construct a BigQuery client object.
client = bigquery.Client()
# TODO(developer): Set dataset_id to the ID of the dataset to fetch.
# dataset_id = 'your-project.your_dataset'
dataset = client.get_dataset(dataset_id) # Make an API request.
entry = bigquery.AccessEntry(
role="READER",
entity_type="userByEmail",
entity_id="sample.bigquery.dev@gmail.com",
)
entries = list(dataset.access_entries)
entries.append(entry)
dataset.access_entries = entries
dataset = client.update_dataset(dataset, ["access_entries"]) # Make an API request.
full_dataset_id = "{}.{}".format(dataset.project, dataset.dataset_id)
print(
"Updated dataset '{}' with modified user permissions.".format(full_dataset_id)
)
Manage Dataset labels¶
Add labels to a dataset with the
update_dataset()
method:
from google.cloud import bigquery
# Construct a BigQuery client object.
client = bigquery.Client()
# TODO(developer): Set dataset_id to the ID of the dataset to fetch.
# dataset_id = "your-project.your_dataset"
dataset = client.get_dataset(dataset_id) # Make an API request.
dataset.labels = {"color": "green"}
dataset = client.update_dataset(dataset, ["labels"]) # Make an API request.
print("Labels added to {}".format(dataset_id))
Get a dataset’s labels with the
get_dataset()
method:
from google.cloud import bigquery
# Construct a BigQuery client object.
client = bigquery.Client()
# TODO(developer): Set dataset_id to the ID of the dataset to fetch.
# dataset_id = "your-project.your_dataset"
dataset = client.get_dataset(dataset_id) # Make an API request.
# View dataset labels.
print("Dataset ID: {}".format(dataset_id))
print("Labels:")
if dataset.labels:
for label, value in dataset.labels.items():
print("\t{}: {}".format(label, value))
else:
print("\tDataset has no labels defined.")
Delete a dataset’s labels with the
update_dataset()
method:
from google.cloud import bigquery
# Construct a BigQuery client object.
client = bigquery.Client()
# TODO(developer): Set dataset_id to the ID of the dataset to fetch.
# dataset_id = "your-project.your_dataset"
dataset = client.get_dataset(dataset_id) # Make an API request.
# To delete a label from a dataset, set its value to None.
dataset.labels["color"] = None
dataset = client.update_dataset(dataset, ["labels"]) # Make an API request.
print("Labels deleted from {}".format(dataset_id))
Deleting a Dataset¶
Delete a dataset with the
delete_dataset()
method:
from google.cloud import bigquery
# Construct a BigQuery client object.
client = bigquery.Client()
# TODO(developer): Set dataset_id to the ID of the dataset to delete.
# dataset_id = 'your-project.your_dataset'
# Use the delete_contents parameter to delete a dataset and its contents.
# Use the not_found_ok parameter to not receive an error if the dataset has already been deleted.
client.delete_dataset(
dataset_id, delete_contents=True, not_found_ok=True
) # Make an API request.
print("Deleted dataset '{}'.".format(dataset_id))