import gspread
from google.oauth2 import service_account as sa
from googleapiclient.discovery import build
from .abstract_dataset_builder import *
[docs]
class GoogleSpreadsheetDatasetBuilder(AbstractDatasetBuilder):
"""
The GoogleSpreadsheetDatasetBuilder class is responsible for building datasets
from Google Sheets using named ranges. It leverages the gspread library to
interact with the Google Sheets API.
Attributes:
DEFAULT_SA_PATH (str): The default path to the service account credentials JSON file.
DEFAULT_SCOPES (list): The default OAuth 2.0 scopes required for authentication.
Methods:
__init__(sa_path=None): Initializes the dataset builder, authenticating with Google Drive.
build_dataset(params, ws_index=0): Builds and returns a dataset based on the given parameters.
"""
DEFAULT_SA_PATH = "./service_account.json"
DEFAULT_SCOPES = ['https://www.googleapis.com/auth/drive']
def __init__(self, file_id=None, sa_path=None) -> None:
self.file_id = file_id
self.sa_path = sa_path or self.DEFAULT_SA_PATH
self.auth = sa.Credentials.from_service_account_file(
self.sa_path,
scopes=self.DEFAULT_SCOPES
)
self.dataset = dict()
[docs]
def build(self, params=None, ws_index=0):
"""
Builds and returns a dataset based on the provideparameters, including named ranges
and an optional worksheet index
Args:
params (dict): Dictionary containing requireparameters such as file_id and ranges.
ws_index (int, optional): Index of the worksheet tbe used (default is 0)
Returns:
Dataset: A dictionary containing the retrieved datmapped by named ranges.
"""
gs = gspread.service_account(self.sa_path)
range_sets = dict()
for el in params["ranges"]:
if not isinstance(el, tuple):
el = (el, self.file_id)
named_range, file_id = el
if not file_id in range_sets:
range_sets[file_id] = []
range_sets[file_id].append(named_range)
for file_id, ranges in range_sets.items():
sheet = gs.open_by_key(file_id)
worksheet = sheet.get_worksheet(ws_index)
response = worksheet.batch_get(
ranges,
value_render_option="UNFORMATTED_VALUE",
)
response_ = {}
for i in range(len(response)):
# TODO @mariana.pais Make sure the system is able to handle missing data. This is just a quick fix for the process to work.
try:
response_[ranges[i]] = round(response[i][0][0], 3)
except Exception:
pass
self.dataset.update(response_)
return self.dataset