Source code for visual_viper.dataset_builders.gs_dataset_builder

import logging

import gspread
from google.oauth2 import service_account as sa
from googleapiclient.discovery import build

from .abstract_dataset_builder import *


[docs]
class GoogleSpreadsheetDatasetBuilder(AbstractDatasetBuilder):
  """
  Builds datasets from named ranges in Google Sheets.

  Spreadsheets are read through the gspread library, authenticated with a
  service account key file.

  Attributes:
    DEFAULT_SA_PATH (str): Path to the service account credentials JSON file
      used when no ``sa_path`` is given.
    DEFAULT_SCOPES (list): OAuth 2.0 scopes requested for the credentials
      created in ``__init__``.
    file_id: Spreadsheet id used for ranges that do not name their own file.
    sa_path (str): Path to the service account credentials file in use.
    auth: Credentials loaded from ``sa_path`` with ``DEFAULT_SCOPES``.
    dataset (dict): Values collected so far, keyed by named range.

  Methods:
    __init__(file_id=None, sa_path=None): Stores the configuration and loads
      the service account credentials.
    build(params=None, ws_index=0): Fetches the requested named ranges and
      returns the accumulated dataset.
  """
  DEFAULT_SA_PATH = "./service_account.json"
  DEFAULT_SCOPES = ['https://www.googleapis.com/auth/drive']

  def __init__(self, file_id=None, sa_path=None) -> None:
    """
    Stores the configuration and loads the service account credentials.

    Args:
      file_id (optional): Default spreadsheet id for ranges passed to
        ``build`` without an explicit file id.
      sa_path (optional): Path to the service account JSON file. Falls back
        to ``DEFAULT_SA_PATH`` when omitted or empty.
    """
    self.file_id = file_id
    self.sa_path = sa_path or self.DEFAULT_SA_PATH
    self.auth = sa.Credentials.from_service_account_file(
      self.sa_path,
      scopes=self.DEFAULT_SCOPES
    )
    self.dataset = {}

  def build(self, params=None, ws_index=0):
    """
    Fetches the requested named ranges and returns the accumulated dataset.

    Each entry of ``params["ranges"]`` is either a range name, which is read
    from the spreadsheet ``self.file_id``, or a ``(range_name, file_id)``
    tuple naming the spreadsheet explicitly. Ranges are grouped by spreadsheet
    and fetched with one batch request per spreadsheet. For every range the
    first cell of the first returned row is rounded to 3 decimal places and
    stored under the range name.

    Ranges whose first cell is missing or not numeric are skipped and
    reported with a warning log record.

    Results are merged into ``self.dataset``, so values from earlier calls
    are kept, and a range name fetched again (or present in several
    spreadsheets) overwrites the previous value.

    Args:
      params (dict, optional): Dictionary with a ``"ranges"`` entry as
        described above. When ``None``, nothing is fetched.
      ws_index (int, optional): Index of the worksheet opened in every
        spreadsheet (default is 0).

    Returns:
      dict: ``self.dataset``, mapping range names to rounded values.

    Raises:
      KeyError: If ``params`` is given but has no ``"ranges"`` entry.
    """
    # The signature allows calling without params; there is nothing to fetch
    # then, so return what has been collected so far instead of failing on
    # the subscript below.
    if params is None:
      return self.dataset

    logger = logging.getLogger(__name__)
    gs = gspread.service_account(self.sa_path)

    # Group the range names by spreadsheet id so that each spreadsheet is
    # opened and queried only once.
    range_sets = {}
    for el in params["ranges"]:
      if not isinstance(el, tuple):
        el = (el, self.file_id)
      named_range, file_id = el
      range_sets.setdefault(file_id, []).append(named_range)

    for file_id, ranges in range_sets.items():
      sheet = gs.open_by_key(file_id)
      worksheet = sheet.get_worksheet(ws_index)

      response = worksheet.batch_get(
        ranges,
        value_render_option="UNFORMATTED_VALUE",
      )

      fetched = {}
      for named_range, values in zip(ranges, response):
        # TODO @mariana.pais Make sure the system is able to handle missing data. This is just a quick fix for the process to work.
        try:
          fetched[named_range] = round(values[0][0], 3)
        except (IndexError, TypeError) as exc:
          # IndexError: the range came back without a first cell.
          # TypeError: the first cell holds something round() cannot handle.
          logger.warning(
            "Skipping named range %r of spreadsheet %r: no numeric value (%s)",
            named_range,
            file_id,
            exc,
          )

      self.dataset.update(fetched)

    return self.dataset