DataBricksAutoLoaderSource

Bases: SourceInterface

The Spark Auto Loader is used to read new data files as they arrive in cloud storage. Further information on Auto Loader is available in the Databricks documentation: https://docs.databricks.com/ingestion/auto-loader/index.html

Example

ADLS Gen2:

from rtdip_sdk.pipelines.sources import DataBricksAutoLoaderSource
from rtdip_sdk.pipelines.utilities import SparkSessionUtility

# Not required if using Databricks
spark = SparkSessionUtility(config={}).execute()

options = {}
path = "abfss://{FILE-SYSTEM}@{ACCOUNT-NAME}.dfs.core.windows.net/{PATH}/{FILE-NAME}"
format = "{DESIRED-FILE-FORMAT}"

DataBricksAutoLoaderSource(spark, options, path, format).read_stream()

OR

DataBricksAutoLoaderSource(spark, options, path, format).read_batch()

AWS S3:

from rtdip_sdk.pipelines.sources import DataBricksAutoLoaderSource
from rtdip_sdk.pipelines.utilities import SparkSessionUtility

# Not required if using Databricks
spark = SparkSessionUtility(config={}).execute()

options = {}
path = "https://s3.{REGION-CODE}.amazonaws.com/{BUCKET-NAME}/{KEY-NAME}"
format = "{DESIRED-FILE-FORMAT}"

DataBricksAutoLoaderSource(spark, options, path, format).read_stream()

OR

DataBricksAutoLoaderSource(spark, options, path, format).read_batch()

GCS:

from rtdip_sdk.pipelines.sources import DataBricksAutoLoaderSource
from rtdip_sdk.pipelines.utilities import SparkSessionUtility

# Not required if using Databricks
spark = SparkSessionUtility(config={}).execute()

options = {}
path = "gs://{BUCKET-NAME}/{FILE-PATH}"
format = "{DESIRED-FILE-FORMAT}"

DataBricksAutoLoaderSource(spark, options, path, format).read_stream()

OR

DataBricksAutoLoaderSource(spark, options, path, format).read_batch()

Parameters:

spark (SparkSession): Spark Session required to read data from cloud storage. (required)
options (dict): Options that can be specified for configuring the Auto Loader. Further information on the available options: https://docs.databricks.com/ingestion/auto-loader/options.html (required)
path (str): The cloud storage path. (required)
format (str): Specifies the file format to be read. Supported formats are listed at https://docs.databricks.com/ingestion/auto-loader/options.html#file-format-options (required)
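
As a concrete illustration of how these parameters fit together, here is a minimal sketch that populates the options dictionary with two commonly used Auto Loader settings (cloudFiles.schemaLocation and cloudFiles.inferColumnTypes, taken from the Databricks Auto Loader options documentation rather than from this class); the class itself adds cloudFiles.format from the format argument. Paths in braces are placeholders.

```python
from rtdip_sdk.pipelines.sources import DataBricksAutoLoaderSource
from rtdip_sdk.pipelines.utilities import SparkSessionUtility

# Not required if using Databricks
spark = SparkSessionUtility(config={}).execute()

options = {
    # where Auto Loader persists the inferred schema between runs
    "cloudFiles.schemaLocation": "abfss://{FILE-SYSTEM}@{ACCOUNT-NAME}.dfs.core.windows.net/{SCHEMA-PATH}",
    # infer column types rather than reading every column as a string
    "cloudFiles.inferColumnTypes": "true",
}

source = DataBricksAutoLoaderSource(
    spark=spark,
    options=options,
    path="abfss://{FILE-SYSTEM}@{ACCOUNT-NAME}.dfs.core.windows.net/{PATH}",
    format="json",
)
df = source.read_stream()
```
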
Source code in src/sdk/python/rtdip_sdk/pipelines/sources/spark/autoloader.py
class DataBricksAutoLoaderSource(SourceInterface):
    """
    The Spark Auto Loader is used to read new data files as they arrive in cloud storage. Further information on Auto Loader is available [here](https://docs.databricks.com/ingestion/auto-loader/index.html)

    Example
    --------
    === "ADLS Gen2"

        ```python
        from rtdip_sdk.pipelines.sources import DataBricksAutoLoaderSource
        from rtdip_sdk.pipelines.utilities import SparkSessionUtility

        # Not required if using Databricks
        spark = SparkSessionUtility(config={}).execute()

        options = {}
        path = "abfss://{FILE-SYSTEM}@{ACCOUNT-NAME}.dfs.core.windows.net/{PATH}/{FILE-NAME}"
        format = "{DESIRED-FILE-FORMAT}"

        DataBricksAutoLoaderSource(spark, options, path, format).read_stream()

        OR

        DataBricksAutoLoaderSource(spark, options, path, format).read_batch()
        ```
    === "AWS S3"

        ```python
        from rtdip_sdk.pipelines.sources import DataBricksAutoLoaderSource
        from rtdip_sdk.pipelines.utilities import SparkSessionUtility

        # Not required if using Databricks
        spark = SparkSessionUtility(config={}).execute()

        options = {}
        path = "https://s3.{REGION-CODE}.amazonaws.com/{BUCKET-NAME}/{KEY-NAME}"
        format = "{DESIRED-FILE-FORMAT}"

        DataBricksAutoLoaderSource(spark, options, path, format).read_stream()

        OR

        DataBricksAutoLoaderSource(spark, options, path, format).read_batch()
        ```
    === "GCS"

        ```python
        from rtdip_sdk.pipelines.sources import DataBricksAutoLoaderSource
        from rtdip_sdk.pipelines.utilities import SparkSessionUtility

        # Not required if using Databricks
        spark = SparkSessionUtility(config={}).execute()

        options = {}
        path = "gs://{BUCKET-NAME}/{FILE-PATH}"
        format = "{DESIRED-FILE-FORMAT}"

        DataBricksAutoLoaderSource(spark, options, path, format).read_stream()

        OR

        DataBricksAutoLoaderSource(spark, options, path, format).read_batch()
        ```

    Parameters:
        spark (SparkSession): Spark Session required to read data from cloud storage
        options (dict): Options that can be specified for configuring the Auto Loader. Further information on the options available are [here](https://docs.databricks.com/ingestion/auto-loader/options.html)
        path (str): The cloud storage path
        format (str): Specifies the file format to be read. Supported formats are available [here](https://docs.databricks.com/ingestion/auto-loader/options.html#file-format-options)
    """

    spark: SparkSession
    options: dict
    path: str

    def __init__(
        self, spark: SparkSession, options: dict, path: str, format: str
    ) -> None:
        self.spark = spark
        self.options = options
        self.path = path
        self.options["cloudFiles.format"] = format

    @staticmethod
    def system_type():
        """
        Attributes:
            SystemType (Environment): Requires PYSPARK on Databricks
        """
        return SystemType.PYSPARK_DATABRICKS

    @staticmethod
    def libraries():
        libraries = Libraries()
        libraries.add_maven_library(get_default_package("spark_delta_core"))
        return libraries

    @staticmethod
    def settings() -> dict:
        return {}

    def pre_read_validation(self):
        return True

    def post_read_validation(self, df: DataFrame):
        return True

    def read_batch(self):
        """
        Raises:
            NotImplementedError: Auto Loader only supports streaming reads. To perform a batch read, use the read_stream method of this component and specify the Trigger on the write_stream to be `availableNow` to perform batch-like reads of cloud storage files.
        """
        raise NotImplementedError(
            "Auto Loader only supports streaming reads. To perform a batch read, use the read_stream method and specify Trigger on the write_stream as `availableNow`"
        )

    def read_stream(self) -> DataFrame:
        """
        Performs streaming reads of files in cloud storage.
        """
        try:
            return (
                self.spark.readStream.format("cloudFiles")
                .options(**self.options)
                .load(self.path)
            )

        except Exception as e:
            logging.exception(str(e))
            raise e

system_type() staticmethod

Attributes:

SystemType (Environment): Requires PYSPARK on Databricks

Source code in src/sdk/python/rtdip_sdk/pipelines/sources/spark/autoloader.py
@staticmethod
def system_type():
    """
    Attributes:
        SystemType (Environment): Requires PYSPARK on Databricks
    """
    return SystemType.PYSPARK_DATABRICKS

read_batch()

Raises:

NotImplementedError: Auto Loader only supports streaming reads. To perform a batch read, use the read_stream method of this component and specify the Trigger on the write_stream to be availableNow to perform batch-like reads of cloud storage files.

Source code in src/sdk/python/rtdip_sdk/pipelines/sources/spark/autoloader.py
def read_batch(self):
    """
    Raises:
        NotImplementedError: Auto Loader only supports streaming reads. To perform a batch read, use the read_stream method of this component and specify the Trigger on the write_stream to be `availableNow` to perform batch-like reads of cloud storage files.
    """
    raise NotImplementedError(
        "Auto Loader only supports streaming reads. To perform a batch read, use the read_stream method and specify Trigger on the write_stream as `availableNow`"
    )
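
A minimal sketch of the batch-like pattern described in the exception message, assuming a Spark version that supports the availableNow trigger; the sink format, destination path and checkpoint location below are illustrative placeholders.

```python
# Stream the files once and stop when everything currently available has been processed.
df = DataBricksAutoLoaderSource(spark, options, path, format).read_stream()

(
    df.writeStream
    .format("delta")
    .option("checkpointLocation", "{CHECKPOINT-PATH}")
    .trigger(availableNow=True)  # process all files available now, then stop
    .start("{DESTINATION-PATH}")
)
```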

read_stream()

Performs streaming reads of files in cloud storage.

Source code in src/sdk/python/rtdip_sdk/pipelines/sources/spark/autoloader.py
def read_stream(self) -> DataFrame:
    """
    Performs streaming reads of files in cloud storage.
    """
    try:
        return (
            self.spark.readStream.format("cloudFiles")
            .options(**self.options)
            .load(self.path)
        )

    except Exception as e:
        logging.exception(str(e))
        raise e

SparkDeltaSharingSource

Bases: SourceInterface

The Spark Delta Sharing Source is used to read data from a Delta table where Delta sharing is configured

Example

#Delta Sharing Source for Streaming Queries

from rtdip_sdk.pipelines.sources import SparkDeltaSharingSource
from rtdip_sdk.pipelines.utilities import SparkSessionUtility

# Not required if using Databricks
spark = SparkSessionUtility(config={}).execute()

delta_sharing_source = SparkDeltaSharingSource(
    spark=spark,
    options={
        "maxFilesPerTrigger": 1000,
        "ignoreChanges: True,
        "startingVersion": 0
    },
    table_path="{YOUR-DELTA-TABLE-PATH}"
)

delta_sharing_source.read_stream()

#Delta Sharing Source for Batch Queries

from rtdip_sdk.pipelines.sources import SparkDeltaSharingSource
from rtdip_sdk.pipelines.utilities import SparkSessionUtility

# Not required if using Databricks
spark = SparkSessionUtility(config={}).execute()

delta_sharing_source = SparkDeltaSharingSource(
    spark=spark,
    options={
        "versionAsOf": 0,
        "timestampAsOf": "yyyy-mm-dd hh:mm:ss[.fffffffff]"
    },
    table_path="{YOUR-DELTA-TABLE-PATH}"
)

delta_sharing_source.read_batch()

Parameters:

spark (SparkSession): Spark Session required to read data from a Delta table. (required)
options (dict): Options that can be specified for a Delta Table read operation (see Attributes below). Further information on the options is available at https://docs.databricks.com/data-sharing/read-data-open.html#apache-spark-read-shared-data (required)
table_path (str): Path to credentials file and Delta table to query. (required)

Attributes:

ignoreDeletes (bool str): Ignore transactions that delete data at partition boundaries. (Streaming)
ignoreChanges (bool str): Pre-process updates if files had to be rewritten in the source table due to a data changing operation. (Streaming)
startingVersion (int str): The Delta Lake version to start from. (Streaming)
startingTimestamp (datetime str): The timestamp to start from. (Streaming)
maxFilesPerTrigger (int): How many new files to be considered in every micro-batch. The default is 1000. (Streaming)
maxBytesPerTrigger (int): How much data gets processed in each micro-batch. (Streaming)
readChangeFeed (bool str): Stream read the change data feed of the shared table. (Batch & Streaming)
timestampAsOf (datetime str): Query the Delta Table from a specific point in time. (Batch)
versionAsOf (int str): Query the Delta Table from a specific version. (Batch)
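
As an illustration of the readChangeFeed and startingVersion attributes above, the following hedged sketch streams the change data feed of a shared table. The {PROFILE-FILE-PATH}#{SHARE}.{SCHEMA}.{TABLE} addressing is the usual Delta Sharing convention, and the sketch assumes the provider exposes a change data feed for the table.

```python
from rtdip_sdk.pipelines.sources import SparkDeltaSharingSource
from rtdip_sdk.pipelines.utilities import SparkSessionUtility

# Not required if using Databricks
spark = SparkSessionUtility(config={}).execute()

cdf_source = SparkDeltaSharingSource(
    spark=spark,
    options={
        "readChangeFeed": "true",
        "startingVersion": 0
    },
    table_path="{PROFILE-FILE-PATH}#{SHARE}.{SCHEMA}.{TABLE}"
)

cdf_df = cdf_source.read_stream()
```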

Source code in src/sdk/python/rtdip_sdk/pipelines/sources/spark/delta_sharing.py
class SparkDeltaSharingSource(SourceInterface):
    """
    The Spark Delta Sharing Source is used to read data from a Delta table where Delta sharing is configured

    Example
    --------
    ```python
    #Delta Sharing Source for Streaming Queries

    from rtdip_sdk.pipelines.sources import SparkDeltaSharingSource
    from rtdip_sdk.pipelines.utilities import SparkSessionUtility

    # Not required if using Databricks
    spark = SparkSessionUtility(config={}).execute()

    delta_sharing_source = SparkDeltaSharingSource(
        spark=spark,
        options={
            "maxFilesPerTrigger": 1000,
            "ignoreChanges: True,
            "startingVersion": 0
        },
        table_path="{YOUR-DELTA-TABLE-PATH}"
    )

    delta_sharing_source.read_stream()
    ```
    ```python
    #Delta Sharing Source for Batch Queries

    from rtdip_sdk.pipelines.sources import SparkDeltaSharingSource
    from rtdip_sdk.pipelines.utilities import SparkSessionUtility

    # Not required if using Databricks
    spark = SparkSessionUtility(config={}).execute()

    delta_sharing_source = SparkDeltaSharingSource(
        spark=spark,
        options={
            "versionAsOf": 0,
            "timestampAsOf": "yyyy-mm-dd hh:mm:ss[.fffffffff]"
        },
        table_path="{YOUR-DELTA-TABLE-PATH}"
    )

    delta_sharing_source.read_batch()
    ```

    Parameters:
        spark (SparkSession): Spark Session required to read data from a Delta table
        options (dict): Options that can be specified for a Delta Table read operation (See Attributes table below). Further information on the options is available [here](https://docs.databricks.com/data-sharing/read-data-open.html#apache-spark-read-shared-data){ target="_blank" }
        table_path (str): Path to credentials file and Delta table to query

    Attributes:
        ignoreDeletes (bool str): Ignore transactions that delete data at partition boundaries. (Streaming)
        ignoreChanges (bool str): Pre-process updates if files had to be rewritten in the source table due to a data changing operation. (Streaming)
        startingVersion (int str): The Delta Lake version to start from. (Streaming)
        startingTimestamp (datetime str): The timestamp to start from. (Streaming)
        maxFilesPerTrigger (int): How many new files to be considered in every micro-batch. The default is 1000. (Streaming)
        maxBytesPerTrigger (int): How much data gets processed in each micro-batch. (Streaming)
        readChangeFeed (bool str): Stream read the change data feed of the shared table. (Batch & Streaming)
        timestampAsOf (datetime str): Query the Delta Table from a specific point in time. (Batch)
        versionAsOf (int str): Query the Delta Table from a specific version. (Batch)
    """

    spark: SparkSession
    options: dict
    table_path: str

    def __init__(self, spark: SparkSession, options: dict, table_path: str) -> None:
        self.spark = spark
        self.options = options
        self.table_path = table_path

    @staticmethod
    def system_type():
        """
        Attributes:
            SystemType (Environment): Requires PYSPARK
        """
        return SystemType.PYSPARK

    @staticmethod
    def libraries():
        libraries = Libraries()
        libraries.add_maven_library(get_default_package("spark_delta_sharing"))
        return libraries

    @staticmethod
    def settings() -> dict:
        return {}

    def pre_read_validation(self):
        return True

    def post_read_validation(self):
        return True

    def read_batch(self):
        """
        Reads batch data from Delta. Most of the options provided by the Apache Spark DataFrame read API are supported for performing batch reads on Delta tables.
        """
        try:
            return (
                self.spark.read.format("deltaSharing")
                .options(**self.options)
                .table(self.table_path)
            )

        except Py4JJavaError as e:
            logging.exception(e.errmsg)
            raise e
        except Exception as e:
            logging.exception(str(e))
            raise e

    def read_stream(self) -> DataFrame:
        """
        Reads streaming data from Delta. All of the data in the table is processed as well as any new data that arrives after the stream started. .load() can take table name or path.
        """
        try:
            return (
                self.spark.readStream.format("deltaSharing")
                .options(**self.options)
                .load(self.table_path)
            )

        except Py4JJavaError as e:
            logging.exception(e.errmsg)
            raise e
        except Exception as e:
            logging.exception(str(e))
            raise e

system_type() staticmethod

Attributes:

SystemType (Environment): Requires PYSPARK

Source code in src/sdk/python/rtdip_sdk/pipelines/sources/spark/delta_sharing.py
@staticmethod
def system_type():
    """
    Attributes:
        SystemType (Environment): Requires PYSPARK
    """
    return SystemType.PYSPARK

read_batch()

Reads batch data from Delta. Most of the options provided by the Apache Spark DataFrame read API are supported for performing batch reads on Delta tables.

Source code in src/sdk/python/rtdip_sdk/pipelines/sources/spark/delta_sharing.py
def read_batch(self):
    """
    Reads batch data from Delta. Most of the options provided by the Apache Spark DataFrame read API are supported for performing batch reads on Delta tables.
    """
    try:
        return (
            self.spark.read.format("deltaSharing")
            .options(**self.options)
            .table(self.table_path)
        )

    except Py4JJavaError as e:
        logging.exception(e.errmsg)
        raise e
    except Exception as e:
        logging.exception(str(e))
        raise e

read_stream()

Reads streaming data from Delta. All of the data in the table is processed as well as any new data that arrives after the stream started. .load() can take table name or path.

Source code in src/sdk/python/rtdip_sdk/pipelines/sources/spark/delta_sharing.py
def read_stream(self) -> DataFrame:
    """
    Reads streaming data from Delta. All of the data in the table is processed as well as any new data that arrives after the stream started. .load() can take table name or path.
    """
    try:
        return (
            self.spark.readStream.format("deltaSharing")
            .options(**self.options)
            .load(self.table_path)
        )

    except Py4JJavaError as e:
        logging.exception(e.errmsg)
        raise e
    except Exception as e:
        logging.exception(str(e))
        raise e
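
The streaming read returns a standard Spark streaming DataFrame. A minimal sketch of writing it on to a Delta sink is shown below; the sink and checkpoint locations are placeholders, and the ignoreChanges option is just one possible configuration.

```python
stream_df = SparkDeltaSharingSource(
    spark=spark,
    options={"ignoreChanges": "true"},
    table_path="{PROFILE-FILE-PATH}#{SHARE}.{SCHEMA}.{TABLE}"
).read_stream()

(
    stream_df.writeStream
    .format("delta")
    .option("checkpointLocation", "{CHECKPOINT-PATH}")
    .start("{DESTINATION-PATH}")
)
```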

SparkEventhubSource

Bases: SourceInterface

This Spark source class is used to read batch or streaming data from Eventhubs. Eventhub configurations need to be specified as options in a dictionary. Additional optional configurations can be found at https://github.com/Azure/azure-event-hubs-spark/blob/master/docs/PySpark/structured-streaming-pyspark.md#event-hubs-configuration. If using startingPosition or endingPosition, make sure to check out the Event Position section of that documentation for more details and examples.

Example

#Eventhub Source for Streaming Queries

from rtdip_sdk.pipelines.sources import SparkEventhubSource
from rtdip_sdk.pipelines.utilities import SparkSessionUtility
import json

# Not required if using Databricks
spark = SparkSessionUtility(config={}).execute()

connectionString = "Endpoint=sb://{NAMESPACE}.servicebus.windows.net/;SharedAccessKeyName={ACCESS_KEY_NAME};SharedAccessKey={ACCESS_KEY}=;EntityPath={EVENT_HUB_NAME}"

startingEventPosition = {
    "offset": -1,
    "seqNo": -1,
    "enqueuedTime": None,
    "isInclusive": True
}

eventhub_source = SparkEventhubSource(
    spark=spark,
    options = {
        "eventhubs.connectionString": connectionString,
        "eventhubs.consumerGroup": "{YOUR-CONSUMER-GROUP}",
        "eventhubs.startingPosition": json.dumps(startingEventPosition),
        "maxEventsPerTrigger" : 1000
    }
)

eventhub_source.read_stream()

#Eventhub Source for Batch Queries

from rtdip_sdk.pipelines.sources import SparkEventhubSource
from rtdip_sdk.pipelines.utilities import SparkSessionUtility
import json

# Not required if using Databricks
spark = SparkSessionUtility(config={}).execute()

connectionString = "Endpoint=sb://{NAMESPACE}.servicebus.windows.net/;SharedAccessKeyName={ACCESS_KEY_NAME};SharedAccessKey={ACCESS_KEY}=;EntityPath={EVENT_HUB_NAME}"

startingEventPosition = {
    "offset": -1,
    "seqNo": -1,
    "enqueuedTime": None,
    "isInclusive": True
}

endingEventPosition = {
    "offset": None,
    "seqNo": -1,
    "enqueuedTime": endTime,
    "isInclusive": True
}

eventhub_source = SparkEventhubSource(
    spark,
    options = {
        "eventhubs.connectionString": connectionString,
        "eventhubs.consumerGroup": "{YOUR-CONSUMER-GROUP}",
        "eventhubs.startingPosition": json.dumps(startingEventPosition),
        "eventhubs.endingPosition": json.dumps(endingEventPosition)
    }
)

eventhub_source.read_batch()
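
The batch example above references an endTime variable without defining it. One way it might be set is sketched below; the ISO-8601 timestamp format is an assumption and should be checked against the connector's Event Position documentation.

```python
from datetime import datetime, timezone

# Hypothetical value for the example above: read everything enqueued up to "now".
endTime = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%S.%fZ")
```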

Parameters:

spark (SparkSession): Spark Session. (required)
options (dict): A dictionary of Eventhub configurations (see Attributes below). (required)

Attributes:

eventhubs.connectionString (str): Eventhubs connection string is required to connect to the Eventhubs service. (Streaming and Batch)
eventhubs.consumerGroup (str): A consumer group is a view of an entire eventhub. Consumer groups enable multiple consuming applications to each have a separate view of the event stream, and to read the stream independently at their own pace and with their own offsets. (Streaming and Batch)
eventhubs.startingPosition (JSON str): The starting position for your Structured Streaming job. If a specific EventPosition is not set for a partition using startingPositions, then the EventPosition set in startingPosition is used. If nothing is set in either option, consumption begins from the end of the partition. (Streaming and Batch)
eventhubs.endingPosition (JSON str): The ending position of a batch query. This works the same as startingPosition. (Batch)
maxEventsPerTrigger (long): Rate limit on the maximum number of events processed per trigger interval. The specified total number of events will be proportionally split across partitions of different volume. (Streaming)
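
The DataFrame returned by this source carries the event payload in a binary body column alongside the Event Hubs metadata columns; a minimal sketch of decoding it, assuming UTF-8 text payloads, is shown below.

```python
from pyspark.sql.functions import col

raw_df = eventhub_source.read_stream()

# body arrives as binary; cast it to a string for downstream parsing
decoded_df = raw_df.withColumn("body", col("body").cast("string"))
```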

Source code in src/sdk/python/rtdip_sdk/pipelines/sources/spark/eventhub.py
class SparkEventhubSource(SourceInterface):
    """
    This Spark source class is used to read batch or streaming data from Eventhubs. Eventhub configurations need to be specified as options in a dictionary.
    Additionally, there are more optional configurations which can be found [here.](https://github.com/Azure/azure-event-hubs-spark/blob/master/docs/PySpark/structured-streaming-pyspark.md#event-hubs-configuration){ target="_blank" }
    If using startingPosition or endingPosition make sure to check out the **Event Position** section for more details and examples.

    Example
    --------
    ```python
    #Eventhub Source for Streaming Queries

    from rtdip_sdk.pipelines.sources import SparkEventhubSource
    from rtdip_sdk.pipelines.utilities import SparkSessionUtility
    import json

    # Not required if using Databricks
    spark = SparkSessionUtility(config={}).execute()

    connectionString = "Endpoint=sb://{NAMESPACE}.servicebus.windows.net/;SharedAccessKeyName={ACCESS_KEY_NAME};SharedAccessKey={ACCESS_KEY}=;EntityPath={EVENT_HUB_NAME}"

    startingEventPosition = {
    "offset": -1,
    "seqNo": -1,
    "enqueuedTime": None,
    "isInclusive": True
    }

    eventhub_source = SparkEventhubSource(
        spark=spark,
        options = {
            "eventhubs.connectionString": connectionString,
            "eventhubs.consumerGroup": "{YOUR-CONSUMER-GROUP}",
            "eventhubs.startingPosition": json.dumps(startingEventPosition),
            "maxEventsPerTrigger" : 1000
        }
    )

    eventhub_source.read_stream()
    ```
    ```python
     #Eventhub Source for Batch Queries

    from rtdip_sdk.pipelines.sources import SparkEventhubSource
    from rtdip_sdk.pipelines.utilities import SparkSessionUtility
    import json

    # Not required if using Databricks
    spark = SparkSessionUtility(config={}).execute()

    connectionString = "Endpoint=sb://{NAMESPACE}.servicebus.windows.net/;SharedAccessKeyName={ACCESS_KEY_NAME};SharedAccessKey={ACCESS_KEY}=;EntityPath={EVENT_HUB_NAME}"

    startingEventPosition = {
        "offset": -1,
        "seqNo": -1,
        "enqueuedTime": None,
        "isInclusive": True
    }

    endingEventPosition = {
        "offset": None,
        "seqNo": -1,
        "enqueuedTime": endTime,
        "isInclusive": True
    }

    eventhub_source = SparkEventhubSource(
        spark,
        options = {
            "eventhubs.connectionString": connectionString,
            "eventhubs.consumerGroup": "{YOUR-CONSUMER-GROUP}",
            "eventhubs.startingPosition": json.dumps(startingEventPosition),
            "eventhubs.endingPosition": json.dumps(endingEventPosition)
        }
    )

    eventhub_source.read_batch()
    ```

    Parameters:
        spark (SparkSession): Spark Session
        options (dict): A dictionary of Eventhub configurations (See Attributes table below)

    Attributes:
        eventhubs.connectionString (str):  Eventhubs connection string is required to connect to the Eventhubs service. (Streaming and Batch)
        eventhubs.consumerGroup (str): A consumer group is a view of an entire eventhub. Consumer groups enable multiple consuming applications to each have a separate view of the event stream, and to read the stream independently at their own pace and with their own offsets. (Streaming and Batch)
        eventhubs.startingPosition (JSON str): The starting position for your Structured Streaming job. If a specific EventPosition is not set for a partition using startingPositions, then we use the EventPosition set in startingPosition. If nothing is set in either option, we will begin consuming from the end of the partition. (Streaming and Batch)
        eventhubs.endingPosition (JSON str): The ending position of a batch query. This works the same as startingPosition. (Batch)
        maxEventsPerTrigger (long): Rate limit on maximum number of events processed per trigger interval. The specified total number of events will be proportionally split across partitions of different volume. (Stream)

    """

    spark: SparkSession
    options: dict

    def __init__(self, spark: SparkSession, options: dict) -> None:
        self.spark = spark
        self.options = options
        self.schema = EVENTHUB_SCHEMA

    @staticmethod
    def system_type():
        """
        Attributes:
            SystemType (Environment): Requires PYSPARK
        """
        return SystemType.PYSPARK

    @staticmethod
    def libraries():
        spark_libraries = Libraries()
        spark_libraries.add_maven_library(get_default_package("spark_azure_eventhub"))
        return spark_libraries

    @staticmethod
    def settings() -> dict:
        return {}

    def pre_read_validation(self) -> bool:
        return True

    def post_read_validation(self, df: DataFrame) -> bool:
        assert df.schema == self.schema
        return True

    def read_batch(self) -> DataFrame:
        """
        Reads batch data from Eventhubs.
        """
        eventhub_connection_string = "eventhubs.connectionString"
        try:
            if eventhub_connection_string in self.options:
                sc = self.spark.sparkContext
                self.options[eventhub_connection_string] = (
                    sc._jvm.org.apache.spark.eventhubs.EventHubsUtils.encrypt(
                        self.options[eventhub_connection_string]
                    )
                )

            return self.spark.read.format("eventhubs").options(**self.options).load()

        except Py4JJavaError as e:
            logging.exception(e.errmsg)
            raise e
        except Exception as e:
            logging.exception(str(e))
            raise e

    def read_stream(self) -> DataFrame:
        """
        Reads streaming data from Eventhubs.
        """
        eventhub_connection_string = "eventhubs.connectionString"
        try:
            if eventhub_connection_string in self.options:
                sc = self.spark.sparkContext
                self.options[eventhub_connection_string] = (
                    sc._jvm.org.apache.spark.eventhubs.EventHubsUtils.encrypt(
                        self.options[eventhub_connection_string]
                    )
                )

            return (
                self.spark.readStream.format("eventhubs").options(**self.options).load()
            )

        except Py4JJavaError as e:
            logging.exception(e.errmsg)
            raise e
        except Exception as e:
            logging.exception(str(e))
            raise e

system_type() staticmethod

Attributes:

SystemType (Environment): Requires PYSPARK

Source code in src/sdk/python/rtdip_sdk/pipelines/sources/spark/eventhub.py
@staticmethod
def system_type():
    """
    Attributes:
        SystemType (Environment): Requires PYSPARK
    """
    return SystemType.PYSPARK

read_batch()

Reads batch data from Eventhubs.

Source code in src/sdk/python/rtdip_sdk/pipelines/sources/spark/eventhub.py
def read_batch(self) -> DataFrame:
    """
    Reads batch data from Eventhubs.
    """
    eventhub_connection_string = "eventhubs.connectionString"
    try:
        if eventhub_connection_string in self.options:
            sc = self.spark.sparkContext
            self.options[eventhub_connection_string] = (
                sc._jvm.org.apache.spark.eventhubs.EventHubsUtils.encrypt(
                    self.options[eventhub_connection_string]
                )
            )

        return self.spark.read.format("eventhubs").options(**self.options).load()

    except Py4JJavaError as e:
        logging.exception(e.errmsg)
        raise e
    except Exception as e:
        logging.exception(str(e))
        raise e

read_stream()

Reads streaming data from Eventhubs.

Source code in src/sdk/python/rtdip_sdk/pipelines/sources/spark/eventhub.py
def read_stream(self) -> DataFrame:
    """
    Reads streaming data from Eventhubs.
    """
    eventhub_connection_string = "eventhubs.connectionString"
    try:
        if eventhub_connection_string in self.options:
            sc = self.spark.sparkContext
            self.options[eventhub_connection_string] = (
                sc._jvm.org.apache.spark.eventhubs.EventHubsUtils.encrypt(
                    self.options[eventhub_connection_string]
                )
            )

        return (
            self.spark.readStream.format("eventhubs").options(**self.options).load()
        )

    except Py4JJavaError as e:
        logging.exception(e.errmsg)
        raise e
    except Exception as e:
        logging.exception(str(e))
        raise e

SparkIoThubSource

Bases: SourceInterface

This Spark source class is used to read batch or streaming data from an IoT Hub. IoT Hub configurations need to be specified as options in a dictionary. Additional optional configurations can be found at https://github.com/Azure/azure-event-hubs-spark/blob/master/docs/PySpark/structured-streaming-pyspark.md#event-hubs-configuration. If using startingPosition or endingPosition, make sure to check out the Event Position section of that documentation for more details and examples.

Example

#IoT Hub Source for Streaming Queries

from rtdip_sdk.pipelines.sources import SparkIoThubSource
from rtdip_sdk.pipelines.utilities import SparkSessionUtility
import json

# Not required if using Databricks
spark = SparkSessionUtility(config={}).execute()

connectionString = "Endpoint=sb://{NAMESPACE}.servicebus.windows.net/;SharedAccessKeyName={ACCESS_KEY_NAME};SharedAccessKey={ACCESS_KEY}=;EntityPath={EVENT_HUB_NAME}"

startingEventPosition = {
    "offset": -1,
    "seqNo": -1,
    "enqueuedTime": None,
    "isInclusive": True
}

iot_hub_source = SparkIoThubSource(
    spark=spark,
    options = {
        "eventhubs.connectionString": connectionString,
        "eventhubs.consumerGroup": "{YOUR-CONSUMER-GROUP}",
        "eventhubs.startingPosition": json.dumps(startingEventPosition),
        "maxEventsPerTrigger" : 1000
    }
)

iot_hub_source.read_stream()

#IoT Hub Source for Batch Queries

from rtdip_sdk.pipelines.sources import SparkIoThubSource
from rtdip_sdk.pipelines.utilities import SparkSessionUtility
import json

# Not required if using Databricks
spark = SparkSessionUtility(config={}).execute()

connectionString = "Endpoint=sb://{NAMESPACE}.servicebus.windows.net/;SharedAccessKeyName={ACCESS_KEY_NAME};SharedAccessKey={ACCESS_KEY}=;EntityPath={EVENT_HUB_NAME}"

startingEventPosition = {
    "offset": -1,
    "seqNo": -1,
    "enqueuedTime": None,
    "isInclusive": True
}

endingEventPosition = {
    "offset": None,
    "seqNo": -1,
    "enqueuedTime": endTime,
    "isInclusive": True
}

iot_hub_source = SparkIoThubSource(
    spark,
    options = {
        "eventhubs.connectionString": connectionString,
        "eventhubs.consumerGroup": "{YOUR-CONSUMER-GROUP}",
        "eventhubs.startingPosition": json.dumps(startingEventPosition),
        "eventhubs.endingPosition": json.dumps(endingEventPosition)
    }
)

iot_hub_source.read_batch()

Parameters:

spark (SparkSession): Spark Session. (required)
options (dict): A dictionary of IoT Hub configurations (see Attributes below). (required)

Attributes:

eventhubs.connectionString (str): IoT Hub connection string is required to connect to the Eventhubs service. (Streaming and Batch)
eventhubs.consumerGroup (str): A consumer group is a view of an entire IoT Hub. Consumer groups enable multiple consuming applications to each have a separate view of the event stream, and to read the stream independently at their own pace and with their own offsets. (Streaming and Batch)
eventhubs.startingPosition (JSON str): The starting position for your Structured Streaming job. If a specific EventPosition is not set for a partition using startingPositions, then the EventPosition set in startingPosition is used. If nothing is set in either option, consumption begins from the end of the partition. (Streaming and Batch)
eventhubs.endingPosition (JSON str): The ending position of a batch query. This works the same as startingPosition. (Batch)
maxEventsPerTrigger (long): Rate limit on the maximum number of events processed per trigger interval. The specified total number of events will be proportionally split across partitions of different volume. (Streaming)
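
Device telemetry typically arrives as JSON in the binary body column; the sketch below decodes it with a hypothetical telemetry schema (deviceId, temperature and timestamp are illustrative field names, not part of this component).

```python
from pyspark.sql.functions import col, from_json
from pyspark.sql.types import DoubleType, StringType, StructField, StructType, TimestampType

# Hypothetical telemetry schema; replace with the fields your devices actually send.
telemetry_schema = StructType([
    StructField("deviceId", StringType()),
    StructField("temperature", DoubleType()),
    StructField("timestamp", TimestampType()),
])

raw_df = iot_hub_source.read_stream()

telemetry_df = raw_df.select(
    from_json(col("body").cast("string"), telemetry_schema).alias("payload")
).select("payload.*")
```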

Source code in src/sdk/python/rtdip_sdk/pipelines/sources/spark/iot_hub.py
class SparkIoThubSource(SourceInterface):
    """
    This Spark source class is used to read batch or streaming data from an IoT Hub. IoT Hub configurations need to be specified as options in a dictionary.
    Additionally, there are more optional configurations which can be found [here.](https://github.com/Azure/azure-event-hubs-spark/blob/master/docs/PySpark/structured-streaming-pyspark.md#event-hubs-configuration){ target="_blank" }
    If using startingPosition or endingPosition make sure to check out the **Event Position** section for more details and examples.

    Example
    --------
    ```python
    #IoT Hub Source for Streaming Queries

    from rtdip_sdk.pipelines.sources import SparkIoThubSource
    from rtdip_sdk.pipelines.utilities import SparkSessionUtility
    import json

    # Not required if using Databricks
    spark = SparkSessionUtility(config={}).execute()

    connectionString = "Endpoint=sb://{NAMESPACE}.servicebus.windows.net/;SharedAccessKeyName={ACCESS_KEY_NAME};SharedAccessKey={ACCESS_KEY}=;EntityPath={EVENT_HUB_NAME}"

    startingEventPosition = {
    "offset": -1,
    "seqNo": -1,
    "enqueuedTime": None,
    "isInclusive": True
    }

    iot_hub_source = SparkIoThubSource(
        spark=spark,
        options = {
            "eventhubs.connectionString": connectionString,
            "eventhubs.consumerGroup": "{YOUR-CONSUMER-GROUP}",
            "eventhubs.startingPosition": json.dumps(startingEventPosition),
            "maxEventsPerTrigger" : 1000
        }
    )

    iot_hub_source.read_stream()
    ```
    ```python
     #IoT Hub Source for Batch Queries

    from rtdip_sdk.pipelines.sources import SparkIoThubSource
    from rtdip_sdk.pipelines.utilities import SparkSessionUtility
    import json

    # Not required if using Databricks
    spark = SparkSessionUtility(config={}).execute()

    connectionString = "Endpoint=sb://{NAMESPACE}.servicebus.windows.net/;SharedAccessKeyName={ACCESS_KEY_NAME};SharedAccessKey={ACCESS_KEY}=;EntityPath={EVENT_HUB_NAME}"

    startingEventPosition = {
        "offset": -1,
        "seqNo": -1,
        "enqueuedTime": None,
        "isInclusive": True
    }

    endingEventPosition = {
        "offset": None,
        "seqNo": -1,
        "enqueuedTime": endTime,
        "isInclusive": True
    }

    iot_hub_source = SparkIoThubSource(
        spark,
        options = {
            "eventhubs.connectionString": connectionString,
            "eventhubs.consumerGroup": "{YOUR-CONSUMER-GROUP}",
            "eventhubs.startingPosition": json.dumps(startingEventPosition),
            "eventhubs.endingPosition": json.dumps(endingEventPosition)
        }
    )

    iot_hub_source.read_batch()
    ```

    Parameters:
        spark (SparkSession): Spark Session
        options (dict): A dictionary of IoT Hub configurations (See Attributes table below)

    Attributes:
        eventhubs.connectionString (str):  IoT Hub connection string is required to connect to the Eventhubs service. (Streaming and Batch)
        eventhubs.consumerGroup (str): A consumer group is a view of an entire IoT Hub. Consumer groups enable multiple consuming applications to each have a separate view of the event stream, and to read the stream independently at their own pace and with their own offsets. (Streaming and Batch)
        eventhubs.startingPosition (JSON str): The starting position for your Structured Streaming job. If a specific EventPosition is not set for a partition using startingPositions, then we use the EventPosition set in startingPosition. If nothing is set in either option, we will begin consuming from the end of the partition. (Streaming and Batch)
        eventhubs.endingPosition (JSON str): The ending position of a batch query. This works the same as startingPosition. (Batch)
        maxEventsPerTrigger (long): Rate limit on maximum number of events processed per trigger interval. The specified total number of events will be proportionally split across partitions of different volume. (Stream)

    """

    options: dict
    spark: SparkSession

    def __init__(self, spark: SparkSession, options: dict) -> None:
        self.spark = spark
        self.schema = EVENTHUB_SCHEMA
        self.options = options

    @staticmethod
    def system_type():
        """
        Attributes:
            SystemType (Environment): Requires PYSPARK
        """
        return SystemType.PYSPARK

    @staticmethod
    def settings() -> dict:
        return {}

    @staticmethod
    def libraries():
        spark_libraries = Libraries()
        spark_libraries.add_maven_library(get_default_package("spark_azure_eventhub"))
        return spark_libraries

    def pre_read_validation(self) -> bool:
        return True

    def post_read_validation(self, df: DataFrame) -> bool:
        assert df.schema == self.schema
        return True

    def read_batch(self) -> DataFrame:
        """
        Reads batch data from IoT Hubs.
        """
        iothub_connection_string = "eventhubs.connectionString"
        try:
            if iothub_connection_string in self.options:
                sc = self.spark.sparkContext
                self.options[iothub_connection_string] = (
                    sc._jvm.org.apache.spark.eventhubs.EventHubsUtils.encrypt(
                        self.options[iothub_connection_string]
                    )
                )

            return self.spark.read.format("eventhubs").options(**self.options).load()

        except Py4JJavaError as e:
            logging.exception(e.errmsg)
            raise e
        except Exception as e:
            logging.exception(str(e))
            raise e

    def read_stream(self) -> DataFrame:
        """
        Reads streaming data from IoT Hubs.
        """
        iothub_connection_string = "eventhubs.connectionString"
        try:
            if iothub_connection_string in self.options:
                sc = self.spark.sparkContext
                self.options[iothub_connection_string] = (
                    sc._jvm.org.apache.spark.eventhubs.EventHubsUtils.encrypt(
                        self.options[iothub_connection_string]
                    )
                )

            return (
                self.spark.readStream.format("eventhubs").options(**self.options).load()
            )

        except Py4JJavaError as e:
            logging.exception(e.errmsg)
            raise e
        except Exception as e:
            logging.exception(str(e))
            raise e

system_type() staticmethod

Attributes:

SystemType (Environment): Requires PYSPARK

Source code in src/sdk/python/rtdip_sdk/pipelines/sources/spark/iot_hub.py
@staticmethod
def system_type():
    """
    Attributes:
        SystemType (Environment): Requires PYSPARK
    """
    return SystemType.PYSPARK

read_batch()

Reads batch data from IoT Hubs.

Source code in src/sdk/python/rtdip_sdk/pipelines/sources/spark/iot_hub.py
def read_batch(self) -> DataFrame:
    """
    Reads batch data from IoT Hubs.
    """
    iothub_connection_string = "eventhubs.connectionString"
    try:
        if iothub_connection_string in self.options:
            sc = self.spark.sparkContext
            self.options[iothub_connection_string] = (
                sc._jvm.org.apache.spark.eventhubs.EventHubsUtils.encrypt(
                    self.options[iothub_connection_string]
                )
            )

        return self.spark.read.format("eventhubs").options(**self.options).load()

    except Py4JJavaError as e:
        logging.exception(e.errmsg)
        raise e
    except Exception as e:
        logging.exception(str(e))
        raise e

read_stream()

Reads streaming data from IoT Hubs.

Source code in src/sdk/python/rtdip_sdk/pipelines/sources/spark/iot_hub.py
def read_stream(self) -> DataFrame:
    """
    Reads streaming data from IoT Hubs.
    """
    iothub_connection_string = "eventhubs.connectionString"
    try:
        if iothub_connection_string in self.options:
            sc = self.spark.sparkContext
            self.options[iothub_connection_string] = (
                sc._jvm.org.apache.spark.eventhubs.EventHubsUtils.encrypt(
                    self.options[iothub_connection_string]
                )
            )

        return (
            self.spark.readStream.format("eventhubs").options(**self.options).load()
        )

    except Py4JJavaError as e:
        logging.exception(e.errmsg)
        raise e
    except Exception as e:
        logging.exception(str(e))
        raise e

SparkKafkaSource

Bases: SourceInterface

This Spark source class is used to read batch or streaming data from Kafka. Required and optional configurations can be found in the Attributes tables below.

Additionally, there are more optional configurations, which can be found at https://spark.apache.org/docs/latest/structured-streaming-kafka-integration.html.

Example

 #Kafka Source for Streaming Queries

from rtdip_sdk.pipelines.sources import SparkKafkaSource
from rtdip_sdk.pipelines.utilities import SparkSessionUtility

# Not required if using Databricks
spark = SparkSessionUtility(config={}).execute()

kafka_source = SparkKafkaSource(
    spark=spark,
    options={
        "kafka.bootstrap.servers": "{HOST_1}:{PORT_1},{HOST_2}:{PORT_2}",
        "subscribe": "{TOPIC_1},{TOPIC_2}",
        "includeHeaders", "true"
    }
)

kafka_source.read_stream()

#Kafka Source for Batch Queries

from rtdip_sdk.pipelines.sources import SparkKafkaSource
from rtdip_sdk.pipelines.utilities import SparkSessionUtility

# Not required if using Databricks
spark = SparkSessionUtility(config={}).execute()

kafka_source = SparkKafkaSource(
    spark=spark,
    options={
        "kafka.bootstrap.servers": "{HOST_1}:{PORT_1},{HOST_2}:{PORT_2}",
        "subscribe": "{TOPIC_1},{TOPIC_2}",
        "startingOffsets": "earliest",
        "endingOffsets": "latest"
    }
)

kafka_source.read_batch()

Parameters:

spark (SparkSession): Spark Session. (required)
options (dict): A dictionary of Kafka configurations (see Attributes tables below). For more information on configuration options see https://spark.apache.org/docs/latest/structured-streaming-kafka-integration.html (required)

The following attributes are the most common configurations for Kafka.

The only configuration that must be set for the Kafka source, for both batch and streaming queries, is listed below.

Attributes:

kafka.bootstrap.servers (A comma-separated list of host:port): The Kafka "bootstrap.servers" configuration. (Streaming and Batch)

There are multiple ways of specifying which topics to subscribe to. You should provide only one of these attributes:

Attributes:

assign (JSON string {"topicA":[0,1],"topicB":[2,4]}): Specific TopicPartitions to consume. Only one of "assign", "subscribe" or "subscribePattern" options can be specified for the Kafka source. (Streaming and Batch)
subscribe (A comma-separated list of topics): The topic list to subscribe to. Only one of "assign", "subscribe" or "subscribePattern" options can be specified for the Kafka source. (Streaming and Batch)
subscribePattern (Java regex string): The pattern used to subscribe to topic(s). Only one of "assign", "subscribe" or "subscribePattern" options can be specified for the Kafka source. (Streaming and Batch)

The following configurations are optional:

Attributes:

startingTimestamp (timestamp str): The starting timestamp when a query is started, a string specifying a starting timestamp for all partitions in the subscribed topics. Please refer to the note on starting timestamp offset options below. (Streaming and Batch)
startingOffsetsByTimestamp (JSON str): The starting timestamp when a query is started, a JSON string specifying a starting timestamp for each TopicPartition. Please refer to the note on starting timestamp offset options below. (Streaming and Batch)
startingOffsets ("earliest", "latest" (streaming only), or JSON string): The start point when a query is started, either "earliest" which is from the earliest offsets, "latest" which is just from the latest offsets, or a JSON string specifying a starting offset for each TopicPartition. In the JSON, -2 as an offset can be used to refer to earliest, -1 to latest.
endingTimestamp (timestamp str): The end point when a batch query is ended, a string specifying an ending timestamp for all partitions in the subscribed topics. Please refer to the note on ending timestamp offset options below. (Batch)
endingOffsetsByTimestamp (JSON str): The end point when a batch query is ended, a JSON string specifying an ending timestamp for each TopicPartition. Please refer to the note on ending timestamp offset options below. (Batch)
endingOffsets (latest or JSON str): The end point when a batch query is ended, either "latest" which refers to the latest offsets, or a JSON string specifying an ending offset for each TopicPartition. In the JSON, -1 as an offset can be used to refer to latest, while -2 (earliest) is not allowed. (Batch)
maxOffsetsPerTrigger (long): Rate limit on the maximum number of offsets processed per trigger interval. The specified total number of offsets will be proportionally split across topicPartitions of different volume. (Streaming)
minOffsetsPerTrigger (long): Minimum number of offsets to be processed per trigger interval. The specified total number of offsets will be proportionally split across topicPartitions of different volume. (Streaming)
failOnDataLoss (bool): Whether to fail the query when it is possible that data has been lost (e.g. topics are deleted or offsets are out of range). This may be a false alarm; it can be disabled when it does not work as expected.
minPartitions (int): Desired minimum number of partitions to read from Kafka. By default, Spark has a 1-1 mapping of topicPartitions to Spark partitions consuming from Kafka. (Streaming and Batch)
includeHeaders (bool): Whether to include the Kafka headers in the row. (Streaming and Batch)

Starting Timestamp Offset Note

If Kafka doesn't return the matched offset, the behavior follows the value of the option startingOffsetsByTimestampStrategy.

startingTimestamp takes precedence over startingOffsetsByTimestamp and startingOffsets.

For streaming queries, this only applies when a new query is started; resuming will always pick up from where the query left off. Newly discovered partitions during a query will start at earliest.

Ending Timestamp Offset Note

If Kafka doesn't return the matched offset, the offset will be set to latest.

endingOffsetsByTimestamp takes precedence over endingOffsets.
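
Kafka rows expose key and value as binary columns alongside topic, partition, offset and timestamp metadata; a minimal sketch of decoding them, assuming string-encoded keys and values, is shown below.

```python
raw_df = kafka_source.read_stream()

# key and value are binary; cast them to strings for downstream processing
decoded_df = raw_df.selectExpr(
    "CAST(key AS STRING) AS key",
    "CAST(value AS STRING) AS value",
    "topic",
    "partition",
    "offset",
    "timestamp",
)
```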

Source code in src/sdk/python/rtdip_sdk/pipelines/sources/spark/kafka.py
class SparkKafkaSource(SourceInterface):
    """
    This Spark source class is used to read batch or streaming data from Kafka. Required and optional configurations can be found in the Attributes tables below.

    Additionally, there are more optional configurations which can be found [here.](https://spark.apache.org/docs/latest/structured-streaming-kafka-integration.html){ target="_blank" }

    Example
    --------
    ```python
     #Kafka Source for Streaming Queries

    from rtdip_sdk.pipelines.sources import SparkKafkaSource
    from rtdip_sdk.pipelines.utilities import SparkSessionUtility

    # Not required if using Databricks
    spark = SparkSessionUtility(config={}).execute()

    kafka_source = SparkKafkaSource(
        spark=spark,
        options={
            "kafka.bootstrap.servers": "{HOST_1}:{PORT_1},{HOST_2}:{PORT_2}",
            "subscribe": "{TOPIC_1},{TOPIC_2}",
            "includeHeaders", "true"
        }
    )

    kafka_source.read_stream()
    ```
    ```python
     #Kafka Source for Batch Queries

    from rtdip_sdk.pipelines.sources import SparkKafkaSource
    from rtdip_sdk.pipelines.utilities import SparkSessionUtility

    # Not required if using Databricks
    spark = SparkSessionUtility(config={}).execute()

    kafka_source = SparkKafkaSource(
        spark=spark,
        options={
            "kafka.bootstrap.servers": "{HOST_1}:{PORT_1},{HOST_2}:{PORT_2}",
            "subscribe": "{TOPIC_1},{TOPIC_2}",
            "startingOffsets": "earliest",
            "endingOffsets": "latest"
        }
    )

    kafka_source.read_batch()
    ```

    Parameters:
        spark (SparkSession): Spark Session
        options (dict): A dictionary of Kafka configurations (See Attributes tables below). For more information on configuration options see [here](https://spark.apache.org/docs/latest/structured-streaming-kafka-integration.html){ target="_blank" }

    The following attributes are the most common configurations for Kafka.

    The only configuration that must be set for the Kafka source for both batch and streaming queries is listed below.

    Attributes:
        kafka.bootstrap.servers (A comma-separated list of host︰port):  The Kafka "bootstrap.servers" configuration. (Streaming and Batch)

    There are multiple ways of specifying which topics to subscribe to. You should provide only one of these attributes:

    Attributes:
        assign (json string {"topicA"︰[0,1],"topicB"︰[2,4]}):  Specific TopicPartitions to consume. Only one of "assign", "subscribe" or "subscribePattern" options can be specified for Kafka source. (Streaming and Batch)
        subscribe (A comma-separated list of topics): The topic list to subscribe. Only one of "assign", "subscribe" or "subscribePattern" options can be specified for Kafka source. (Streaming and Batch)
        subscribePattern (Java regex string): The pattern used to subscribe to topic(s). Only one of "assign", "subscribe" or "subscribePattern" options can be specified for Kafka source. (Streaming and Batch)

    The following configurations are optional:

    Attributes:
        startingTimestamp (timestamp str): The start point of timestamp when a query is started, a string specifying a starting timestamp for all partitions in topics being subscribed. Please refer the note on starting timestamp offset options below. (Streaming and Batch)
        startingOffsetsByTimestamp (JSON str): The start point of timestamp when a query is started, a json string specifying a starting timestamp for each TopicPartition. Please refer the note on starting timestamp offset options below. (Streaming and Batch)
        startingOffsets ("earliest", "latest" (streaming only), or JSON string): The start point when a query is started, either "earliest" which is from the earliest offsets, "latest" which is just from the latest offsets, or a json string specifying a starting offset for each TopicPartition. In the json, -2 as an offset can be used to refer to earliest, -1 to latest.
        endingTimestamp (timestamp str): The end point when a batch query is ended, a json string specifying an ending timestamp for all partitions in topics being subscribed. Please refer the note on ending timestamp offset options below. (Batch)
        endingOffsetsByTimestamp (JSON str): The end point when a batch query is ended, a json string specifying an ending timestamp for each TopicPartition. Please refer the note on ending timestamp offset options below. (Batch)
        endingOffsets (latest or JSON str): The end point when a batch query is ended, either "latest" which is just referred to the latest, or a json string specifying an ending offset for each TopicPartition. In the json, -1 as an offset can be used to refer to latest, and -2 (earliest) as an offset is not allowed. (Batch)
        maxOffsetsPerTrigger (long): Rate limit on maximum number of offsets processed per trigger interval. The specified total number of offsets will be proportionally split across topicPartitions of different volume. (Streaming)
        minOffsetsPerTrigger (long): Minimum number of offsets to be processed per trigger interval. The specified total number of offsets will be proportionally split across topicPartitions of different volume. (Streaming)
        failOnDataLoss (bool): Whether to fail the query when it's possible that data is lost (e.g., topics are deleted, or offsets are out of range). This may be a false alarm. You can disable it when it doesn't work as you expected.
        minPartitions (int): Desired minimum number of partitions to read from Kafka. By default, Spark has a 1-1 mapping of topicPartitions to Spark partitions consuming from Kafka. (Streaming and Batch)
        includeHeaders (bool): Whether to include the Kafka headers in the row. (Streaming and Batch)

    !!! note "Starting Timestamp Offset Note"
        If Kafka doesn't return the matched offset, the behavior will follow to the value of the option <code>startingOffsetsByTimestampStrategy</code>.

        <code>startingTimestamp</code> takes precedence over <code>startingOffsetsByTimestamp</code> and <code>startingOffsets</code>.

        For streaming queries, this only applies when a new query is started, and that resuming will always pick up from where the query left off. Newly discovered partitions during a query will start at earliest.

    !!! note "Ending Timestamp Offset Note"
        If Kafka doesn't return the matched offset, the offset will be set to latest.

        <code>endingOffsetsByTimestamp</code> takes precedence over <code>endingOffsets</code>.

    """

    spark: SparkSession
    options: dict

    def __init__(self, spark: SparkSession, options: dict) -> None:
        self.spark = spark
        self.options = options
        self.schema = KAFKA_SCHEMA

    @staticmethod
    def system_type():
        """
        Attributes:
            SystemType (Environment): Requires PYSPARK
        """
        return SystemType.PYSPARK

    @staticmethod
    def libraries():
        spark_libraries = Libraries()
        spark_libraries.add_maven_library(get_default_package("spark_sql_kafka"))
        return spark_libraries

    @staticmethod
    def settings() -> dict:
        return {}

    def pre_read_validation(self) -> bool:
        return True

    def post_read_validation(self, df: DataFrame) -> bool:
        assert df.schema == self.schema
        return True

    def read_batch(self) -> DataFrame:
        """
        Reads batch data from Kafka.
        """
        try:
            return self.spark.read.format("kafka").options(**self.options).load()

        except Py4JJavaError as e:
            logging.exception(e.errmsg)
            raise e
        except Exception as e:
            logging.exception(str(e))
            raise e

    def read_stream(self) -> DataFrame:
        """
        Reads streaming data from Kafka.
        """
        try:
            return self.spark.readStream.format("kafka").options(**self.options).load()

        except Py4JJavaError as e:
            logging.exception(e.errmsg)
            raise e
        except Exception as e:
            logging.exception(str(e))
            raise e

system_type() staticmethod

Attributes:
    SystemType (Environment): Requires PYSPARK

Source code in src/sdk/python/rtdip_sdk/pipelines/sources/spark/kafka.py
@staticmethod
def system_type():
    """
    Attributes:
        SystemType (Environment): Requires PYSPARK
    """
    return SystemType.PYSPARK

read_batch()

Reads batch data from Kafka.

Source code in src/sdk/python/rtdip_sdk/pipelines/sources/spark/kafka.py
def read_batch(self) -> DataFrame:
    """
    Reads batch data from Kafka.
    """
    try:
        return self.spark.read.format("kafka").options(**self.options).load()

    except Py4JJavaError as e:
        logging.exception(e.errmsg)
        raise e
    except Exception as e:
        logging.exception(str(e))
        raise e

read_stream()

Reads streaming data from Kafka.

Source code in src/sdk/python/rtdip_sdk/pipelines/sources/spark/kafka.py
def read_stream(self) -> DataFrame:
    """
    Reads streaming data from Kafka.
    """
    try:
        return self.spark.readStream.format("kafka").options(**self.options).load()

    except Py4JJavaError as e:
        logging.exception(e.errmsg)
        raise e
    except Exception as e:
        logging.exception(str(e))
        raise e

SparkKafkaEventhubSource

Bases: SourceInterface

This Spark source class is used to read batch or streaming data from an Eventhub using the Kafka protocol. This enables Eventhubs to be used as a source in applications like Delta Live Tables or Databricks Serverless Jobs as the Spark Eventhubs JAR is not supported in these scenarios.

The dataframe returned is transformed to ensure the schema is as close to the Eventhub Spark source as possible. There are some minor differences:

  • offset is dependent on x-opt-offset being populated in the headers provided. If this is not found in the headers, the value will be null
  • publisher is dependent on x-opt-publisher being populated in the headers provided. If this is not found in the headers, the value will be null
  • partitionKey is dependent on x-opt-partition-key being populated in the headers provided. If this is not found in the headers, the value will be null
  • systemProperties are identified according to the list provided in the Eventhub documentation and IoT Hub documentation

Default settings will be specified if not provided in the options parameter:

  • kafka.sasl.mechanism will be set to PLAIN
  • kafka.security.protocol will be set to SASL_SSL
  • kafka.request.timeout.ms will be set to 60000
  • kafka.session.timeout.ms will be set to 60000

Examples

#Kafka Source for Streaming Queries

from rtdip_sdk.pipelines.sources import SparkKafkaEventhubSource
from rtdip_sdk.pipelines.utilities import SparkSessionUtility

# Not required if using Databricks
spark = SparkSessionUtility(config={}).execute()

connectionString = "Endpoint=sb://{NAMESPACE}.servicebus.windows.net/;SharedAccessKeyName={ACCESS_KEY_NAME};SharedAccessKey={ACCESS_KEY}=;EntityPath={EVENT_HUB_NAME}"
consumerGroup = "{YOUR-CONSUMER-GROUP}"

kafka_eventhub_source = SparkKafkaEventhubSource(
    spark=spark,
    options={
        "startingOffsets": "earliest",
        "maxOffsetsPerTrigger": 10000,
        "failOnDataLoss": "false",
    },
    connection_string=connectionString,
    consumer_group="consumerGroup"
)

kafka_eventhub_source.read_stream()
#Kafka Source for Batch Queries

from rtdip_sdk.pipelines.sources import SparkKafkaEventhubSource
from rtdip_sdk.pipelines.utilities import SparkSessionUtility

# Not required if using Databricks
spark = SparkSessionUtility(config={}).execute()

connectionString = "Endpoint=sb://{NAMESPACE}.servicebus.windows.net/;SharedAccessKeyName={ACCESS_KEY_NAME};SharedAccessKey={ACCESS_KEY}=;EntityPath={EVENT_HUB_NAME}"
consumerGroup = "{YOUR-CONSUMER-GROUP}"

kafka_eventhub_source = SparkKafkaEventhubSource(
    spark=spark,
    options={
        "startingOffsets": "earliest",
        "endingOffsets": "latest",
        "failOnDataLoss": "false"
    },
    connection_string=connectionString,
    consumer_group="consumerGroup"
)

kafka_eventhub_source.read_batch()
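
The DataFrame returned by either method follows the Eventhub-style schema described above, in which the body column holds the raw payload as binary. As a minimal, hedged sketch (not part of this class), the payload can be decoded for downstream use:

from pyspark.sql.functions import col

df = kafka_eventhub_source.read_batch()

# body is binary in the Eventhub-style schema; cast it to a string for inspection
decoded_df = df.withColumn("body", col("body").cast("string"))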

Required and optional configurations can be found in the Attributes and Parameter tables below. Additionally, there are more optional configurations which can be found here.

Parameters:
    spark (SparkSession): Spark Session. Required.
    options (dict): A dictionary of Kafka configurations (See Attributes tables below). For more information on configuration options see here. Required.
    connection_string (str): Eventhubs connection string is required to connect to the Eventhubs service. This must include the Eventhub name as the EntityPath parameter. Example "Endpoint=sb://test.servicebus.windows.net/;SharedAccessKeyName=test;SharedAccessKey=test_key;EntityPath=test_eventhub". Required.
    consumer_group (str): The Eventhub consumer group to use for the connection. Required.
    decode_kafka_headers_to_amqp_properties (optional bool): Perform decoding of Kafka headers into their AMQP properties. Default is True.

The only configuration that must be set for the Kafka source for both batch and streaming queries is listed below.

Attributes:
    kafka.bootstrap.servers (A comma-separated list of host:port): The Kafka "bootstrap.servers" configuration. (Streaming and Batch)

There are multiple ways of specifying which topics to subscribe to. You should provide only one of these parameters:

Attributes:
    assign (json string {"topicA":[0,1],"topicB":[2,4]}): Specific TopicPartitions to consume. Only one of "assign", "subscribe" or "subscribePattern" options can be specified for the Kafka source. (Streaming and Batch)
    subscribe (A comma-separated list of topics): The topic list to subscribe to. Only one of "assign", "subscribe" or "subscribePattern" options can be specified for the Kafka source. (Streaming and Batch)
    subscribePattern (Java regex string): The pattern used to subscribe to topic(s). Only one of "assign", "subscribe" or "subscribePattern" options can be specified for the Kafka source. (Streaming and Batch)

The following configurations are optional:

Attributes:
    startingTimestamp (timestamp str): The start point when a query is started, a string specifying a starting timestamp for all partitions in topics being subscribed. Please refer to the note on starting timestamp offset options below. (Streaming and Batch)
    startingOffsetsByTimestamp (JSON str): The start point when a query is started, a json string specifying a starting timestamp for each TopicPartition. Please refer to the note on starting timestamp offset options below. (Streaming and Batch)
    startingOffsets ("earliest", "latest" (streaming only), or JSON string): The start point when a query is started, either "earliest" which is from the earliest offsets, "latest" which is just from the latest offsets, or a json string specifying a starting offset for each TopicPartition. In the json, -2 as an offset can be used to refer to earliest, -1 to latest.
    endingTimestamp (timestamp str): The end point when a batch query is ended, a string specifying an ending timestamp for all partitions in topics being subscribed. Please refer to the note on ending timestamp offset options below. (Batch)
    endingOffsetsByTimestamp (JSON str): The end point when a batch query is ended, a json string specifying an ending timestamp for each TopicPartition. Please refer to the note on ending timestamp offset options below. (Batch)
    endingOffsets (latest or JSON str): The end point when a batch query is ended, either "latest" which refers to the latest offsets, or a json string specifying an ending offset for each TopicPartition. In the json, -1 as an offset can be used to refer to latest, and -2 (earliest) as an offset is not allowed. (Batch)
    maxOffsetsPerTrigger (long): Rate limit on the maximum number of offsets processed per trigger interval. The specified total number of offsets will be proportionally split across topicPartitions of different volume. (Streaming)
    minOffsetsPerTrigger (long): Minimum number of offsets to be processed per trigger interval. The specified total number of offsets will be proportionally split across topicPartitions of different volume. (Streaming)
    failOnDataLoss (bool): Whether to fail the query when it is possible that data is lost (e.g. topics are deleted, or offsets are out of range). This may be a false alarm; it can be disabled when it does not work as expected.
    minPartitions (int): Desired minimum number of partitions to read from Kafka. By default, Spark has a 1-1 mapping of topicPartitions to Spark partitions consuming from Kafka. (Streaming and Batch)
    includeHeaders (bool): Whether to include the Kafka headers in the row. (Streaming and Batch)

Starting Timestamp Offset Note

If Kafka doesn't return the matched offset, the behavior follows the value of the option startingOffsetsByTimestampStrategy.

startingTimestamp takes precedence over startingOffsetsByTimestamp and startingOffsets.

For streaming queries, this only applies when a new query is started; resuming will always pick up from where the query left off. Newly discovered partitions during a query will start at earliest.

Ending Timestamp Offset Note

If Kafka doesn't return the matched offset, the offset will be set to latest.

endingOffsetsByTimestamp takes precedence over endingOffsets.
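
For the JSON forms of startingOffsets and endingOffsets described above, each key is a topic name and each value maps partition numbers to offsets. The snippet below is an illustrative sketch only; the topic name, partitions and offset values are placeholders:

import json

# Hypothetical topic and partitions for illustration
starting_offsets = json.dumps({"{TOPIC-NAME}": {"0": 23, "1": -2}})  # -2 = earliest, -1 = latest
ending_offsets = json.dumps({"{TOPIC-NAME}": {"0": 50, "1": -1}})    # -1 = latest; -2 is not allowed here

options = {
    "startingOffsets": starting_offsets,
    "endingOffsets": ending_offsets,
}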

Source code in src/sdk/python/rtdip_sdk/pipelines/sources/spark/kafka_eventhub.py
class SparkKafkaEventhubSource(SourceInterface):
    """
    This Spark source class is used to read batch or streaming data from an Eventhub using the Kafka protocol. This enables Eventhubs to be used as a source in applications like Delta Live Tables or Databricks Serverless Jobs as the Spark Eventhubs JAR is not supported in these scenarios.

    The dataframe returned is transformed to ensure the schema is as close to the Eventhub Spark source as possible. There are some minor differences:

    - `offset` is dependent on `x-opt-offset` being populated in the headers provided. If this is not found in the headers, the value will be null
    - `publisher` is dependent on `x-opt-publisher` being populated in the headers provided. If this is not found in the headers, the value will be null
    - `partitionKey` is dependent on `x-opt-partition-key` being populated in the headers provided. If this is not found in the headers, the value will be null
    - `systemProperties` are identified according to the list provided in the [Eventhub documentation](https://learn.microsoft.com/en-us/azure/data-explorer/ingest-data-event-hub-overview#event-system-properties-mapping){ target="_blank" } and [IoT Hub documentation](https://learn.microsoft.com/en-us/azure/data-explorer/ingest-data-iot-hub-overview#event-system-properties-mapping){ target="_blank" }

    Default settings will be specified if not provided in the `options` parameter:

    - `kafka.sasl.mechanism` will be set to `PLAIN`
    - `kafka.security.protocol` will be set to `SASL_SSL`
    - `kafka.request.timeout.ms` will be set to `60000`
    - `kafka.session.timeout.ms` will be set to `60000`

    Examples
    --------
    ```python
    #Kafka Source for Streaming Queries

    from rtdip_sdk.pipelines.sources import SparkKafkaEventhubSource
    from rtdip_sdk.pipelines.utilities import SparkSessionUtility

    # Not required if using Databricks
    spark = SparkSessionUtility(config={}).execute()

    connectionString = "Endpoint=sb://{NAMESPACE}.servicebus.windows.net/;SharedAccessKeyName={ACCESS_KEY_NAME};SharedAccessKey={ACCESS_KEY}=;EntityPath={EVENT_HUB_NAME}"
    consumerGroup = "{YOUR-CONSUMER-GROUP}"

    kafka_eventhub_source = SparkKafkaEventhubSource(
        spark=spark,
        options={
            "startingOffsets": "earliest",
            "maxOffsetsPerTrigger": 10000,
            "failOnDataLoss": "false",
        },
        connection_string=connectionString,
        consumer_group="consumerGroup"
    )

    kafka_eventhub_source.read_stream()
    ```
    ```python
    #Kafka Source for Batch Queries

    from rtdip_sdk.pipelines.sources import SparkKafkaEventhubSource
    from rtdip_sdk.pipelines.utilities import SparkSessionUtility

    # Not required if using Databricks
    spark = SparkSessionUtility(config={}).execute()

    connectionString = "Endpoint=sb://{NAMESPACE}.servicebus.windows.net/;SharedAccessKeyName={ACCESS_KEY_NAME};SharedAccessKey={ACCESS_KEY}=;EntityPath={EVENT_HUB_NAME}"
    consumerGroup = "{YOUR-CONSUMER-GROUP}"

    kafka_eventhub_source = SparkKafkaEventhubSource(
        spark=spark,
        options={
            "startingOffsets": "earliest",
            "endingOffsets": "latest",
            "failOnDataLoss": "false"
        },
        connection_string=connectionString,
        consumer_group="consumerGroup"
    )

    kafka_eventhub_source.read_batch()
    ```

    Required and optional configurations can be found in the Attributes and Parameter tables below.
    Additionally, there are more optional configurations which can be found [here.](https://spark.apache.org/docs/latest/structured-streaming-kafka-integration.html){ target="_blank" }

    Parameters:
        spark (SparkSession): Spark Session
        options (dict): A dictionary of Kafka configurations (See Attributes tables below). For more information on configuration options see [here](https://spark.apache.org/docs/latest/structured-streaming-kafka-integration.html){ target="_blank" }
        connection_string (str): Eventhubs connection string is required to connect to the Eventhubs service. This must include the Eventhub name as the `EntityPath` parameter. Example `"Endpoint=sb://test.servicebus.windows.net/;SharedAccessKeyName=test;SharedAccessKey=test_key;EntityPath=test_eventhub"`
        consumer_group (str): The Eventhub consumer group to use for the connection
        decode_kafka_headers_to_amqp_properties (optional bool): Perform decoding of Kafka headers into their AMQP properties. Default is True

    The only configuration that must be set for the Kafka source for both batch and streaming queries is listed below.

    Attributes:
        kafka.bootstrap.servers (A comma-separated list of host︰port):  The Kafka "bootstrap.servers" configuration. (Streaming and Batch)

    There are multiple ways of specifying which topics to subscribe to. You should provide only one of these parameters:

    Attributes:
        assign (json string {"topicA"︰[0,1],"topicB"︰[2,4]}):  Specific TopicPartitions to consume. Only one of "assign", "subscribe" or "subscribePattern" options can be specified for Kafka source. (Streaming and Batch)
        subscribe (A comma-separated list of topics): The topic list to subscribe. Only one of "assign", "subscribe" or "subscribePattern" options can be specified for Kafka source. (Streaming and Batch)
        subscribePattern (Java regex string): The pattern used to subscribe to topic(s). Only one of "assign, "subscribe" or "subscribePattern" options can be specified for Kafka source. (Streaming and Batch)

    The following configurations are optional:

    Attributes:
        startingTimestamp (timestamp str): The start point of timestamp when a query is started, a string specifying a starting timestamp for all partitions in topics being subscribed. Please refer the note on starting timestamp offset options below. (Streaming and Batch)
        startingOffsetsByTimestamp (JSON str): The start point of timestamp when a query is started, a json string specifying a starting timestamp for each TopicPartition. Please refer the note on starting timestamp offset options below. (Streaming and Batch)
        startingOffsets ("earliest", "latest" (streaming only), or JSON string): The start point when a query is started, either "earliest" which is from the earliest offsets, "latest" which is just from the latest offsets, or a json string specifying a starting offset for each TopicPartition. In the json, -2 as an offset can be used to refer to earliest, -1 to latest.
        endingTimestamp (timestamp str): The end point when a batch query is ended, a json string specifying an ending timestamp for all partitions in topics being subscribed. Please refer the note on ending timestamp offset options below. (Batch)
        endingOffsetsByTimestamp (JSON str): The end point when a batch query is ended, a json string specifying an ending timestamp for each TopicPartition. Please refer the note on ending timestamp offset options below. (Batch)
        endingOffsets (latest or JSON str): The end point when a batch query is ended, either "latest" which is just referred to the latest, or a json string specifying an ending offset for each TopicPartition. In the json, -1 as an offset can be used to refer to latest, and -2 (earliest) as an offset is not allowed. (Batch)
        maxOffsetsPerTrigger (long): Rate limit on maximum number of offsets processed per trigger interval. The specified total number of offsets will be proportionally split across topicPartitions of different volume. (Streaming)
        minOffsetsPerTrigger (long): Minimum number of offsets to be processed per trigger interval. The specified total number of offsets will be proportionally split across topicPartitions of different volume. (Streaming)
        failOnDataLoss (bool): Whether to fail the query when it's possible that data is lost (e.g., topics are deleted, or offsets are out of range). This may be a false alarm. You can disable it when it doesn't work as you expected.
        minPartitions (int): Desired minimum number of partitions to read from Kafka. By default, Spark has a 1-1 mapping of topicPartitions to Spark partitions consuming from Kafka. (Streaming and Batch)
        includeHeaders (bool): Whether to include the Kafka headers in the row. (Streaming and Batch)

    !!! note "Starting Timestamp Offset Note"
        If Kafka doesn't return the matched offset, the behavior will follow to the value of the option <code>startingOffsetsByTimestampStrategy</code>.

        <code>startingTimestamp</code> takes precedence over <code>startingOffsetsByTimestamp</code> and <code>startingOffsets</code>.

        For streaming queries, this only applies when a new query is started, and that resuming will always pick up from where the query left off. Newly discovered partitions during a query will start at earliest.

    !!! note "Ending Timestamp Offset Note"
        If Kafka doesn't return the matched offset, the offset will be set to latest.

        <code>endingOffsetsByTimestamp</code> takes precedence over <code>endingOffsets</code>.

    """

    def __init__(
        self,
        spark: SparkSession,
        options: dict,
        connection_string: str,
        consumer_group: str,
        decode_kafka_headers_to_amqp_properties: bool = True,
    ) -> None:
        self.spark = spark
        self.options = options
        self.connection_string = connection_string
        self.consumer_group = consumer_group
        self.decode_kafka_headers_to_amqp_properties = (
            decode_kafka_headers_to_amqp_properties
        )
        self.connection_string_properties = self._parse_connection_string(
            connection_string
        )
        self.schema = KAFKA_EVENTHUB_SCHEMA
        self.options = self._configure_options(options)

    @staticmethod
    def system_type():
        """
        Attributes:
            SystemType (Environment): Requires PYSPARK
        """
        return SystemType.PYSPARK

    @staticmethod
    def libraries():
        spark_libraries = Libraries()
        spark_libraries.add_maven_library(get_default_package("spark_sql_kafka"))
        return spark_libraries

    @staticmethod
    def settings() -> dict:
        return {}

    def pre_read_validation(self) -> bool:
        return True

    def post_read_validation(self, df: DataFrame) -> bool:
        assert df.schema == self.schema
        return True

    # Code is from Azure Eventhub Python SDK. Will import the package if possible with Conda in the  conda-forge channel in the future
    def _parse_connection_string(self, connection_string: str):
        conn_settings = [s.split("=", 1) for s in connection_string.split(";")]
        if any(len(tup) != 2 for tup in conn_settings):
            raise ValueError("Connection string is either blank or malformed.")
        conn_settings = dict(conn_settings)
        shared_access_signature = None
        for key, value in conn_settings.items():
            if key.lower() == "sharedaccesssignature":
                shared_access_signature = value
        shared_access_key = conn_settings.get("SharedAccessKey")
        shared_access_key_name = conn_settings.get("SharedAccessKeyName")
        if any([shared_access_key, shared_access_key_name]) and not all(
            [shared_access_key, shared_access_key_name]
        ):
            raise ValueError(
                "Connection string must have both SharedAccessKeyName and SharedAccessKey."
            )
        if shared_access_signature is not None and shared_access_key is not None:
            raise ValueError(
                "Only one of the SharedAccessKey or SharedAccessSignature must be present."
            )
        endpoint = conn_settings.get("Endpoint")
        if not endpoint:
            raise ValueError("Connection string is either blank or malformed.")
        parsed = urlparse(endpoint.rstrip("/"))
        if not parsed.netloc:
            raise ValueError("Invalid Endpoint on the Connection String.")
        namespace = parsed.netloc.strip()
        properties = {
            "fully_qualified_namespace": namespace,
            "endpoint": endpoint,
            "eventhub_name": conn_settings.get("EntityPath"),
            "shared_access_signature": shared_access_signature,
            "shared_access_key_name": shared_access_key_name,
            "shared_access_key": shared_access_key,
        }
        return properties

    def _connection_string_builder(self, properties: dict) -> str:
        connection_string = "Endpoint=" + properties.get("endpoint") + ";"

        if properties.get("shared_access_key"):
            connection_string += (
                "SharedAccessKey=" + properties.get("shared_access_key") + ";"
            )

        if properties.get("shared_access_key_name"):
            connection_string += (
                "SharedAccessKeyName=" + properties.get("shared_access_key_name") + ";"
            )

        if properties.get("shared_access_signature"):
            connection_string += (
                "SharedAccessSignature="
                + properties.get("shared_access_signature")
                + ";"
            )
        return connection_string

    def _configure_options(self, options: dict) -> dict:
        if "subscribe" not in options:
            options["subscribe"] = self.connection_string_properties.get(
                "eventhub_name"
            )

        if "kafka.bootstrap.servers" not in options:
            options["kafka.bootstrap.servers"] = (
                self.connection_string_properties.get("fully_qualified_namespace")
                + ":9093"
            )

        if "kafka.sasl.mechanism" not in options:
            options["kafka.sasl.mechanism"] = "PLAIN"

        if "kafka.security.protocol" not in options:
            options["kafka.security.protocol"] = "SASL_SSL"

        if "kafka.sasl.jaas.config" not in options:
            kafka_package = "org.apache.kafka.common.security.plain.PlainLoginModule"
            if "DATABRICKS_RUNTIME_VERSION" in os.environ or (
                "_client" in self.spark.__dict__
                and "databricks" in self.spark.client.host
            ):
                kafka_package = "kafkashaded.org.apache.kafka.common.security.plain.PlainLoginModule"
            connection_string = self._connection_string_builder(
                self.connection_string_properties
            )
            options["kafka.sasl.jaas.config"] = (
                '{} required username="$ConnectionString" password="{}";'.format(
                    kafka_package, connection_string
                )
            )  # NOSONAR

        if "kafka.request.timeout.ms" not in options:
            options["kafka.request.timeout.ms"] = "60000"

        if "kafka.session.timeout.ms" not in options:
            options["kafka.session.timeout.ms"] = "60000"

        if "kafka.group.id" not in options:
            options["kafka.group.id"] = self.consumer_group

        options["includeHeaders"] = "true"

        return options

    def _transform_to_eventhub_schema(self, df: DataFrame) -> DataFrame:
        return (
            df.withColumn("headers", map_from_entries(col("headers")))
            .select(
                col("value").alias("body"),
                col("partition").cast("string"),
                col("offset").alias("sequenceNumber"),
                col("timestamp").alias("enqueuedTime"),
                (
                    decode_kafka_headers_to_amqp_properties(col("headers")).alias(
                        "properties"
                    )
                    if self.decode_kafka_headers_to_amqp_properties
                    else create_map().cast("map<string,string>").alias("properties")
                ),
            )
            .withColumn("offset", col("properties").getItem("x-opt-offset"))
            .withColumn("publisher", col("properties").getItem("x-opt-publisher"))
            .withColumn(
                "partitionKey", col("properties").getItem("x-opt-partition-key")
            )
            .withColumn(
                "systemProperties",
                map_filter(
                    col("properties"), lambda k, _: k.isin(eventhub_system_properties)
                ),
            )
            .withColumn(
                "properties",
                map_filter(
                    col("properties"), lambda k, _: ~k.isin(eventhub_system_properties)
                ),
            )
            .select(
                col("body"),
                col("partition"),
                col("offset"),
                col("sequenceNumber"),
                col("enqueuedTime"),
                col("publisher"),
                col("partitionKey"),
                col("properties"),
                col("systemProperties"),
            )
        )

    def read_batch(self) -> DataFrame:
        """
        Reads batch data from Kafka.
        """
        try:
            df = self.spark.read.format("kafka").options(**self.options).load()
            return self._transform_to_eventhub_schema(df)

        except Py4JJavaError as e:
            logging.exception(e.errmsg)
            raise e
        except Exception as e:
            logging.exception(str(e))
            raise e

    def read_stream(self) -> DataFrame:
        """
        Reads streaming data from Kafka.
        """
        try:
            df = self.spark.readStream.format("kafka").options(**self.options).load()
            return self._transform_to_eventhub_schema(df)

        except Py4JJavaError as e:
            logging.exception(e.errmsg)
            raise e
        except Exception as e:
            logging.exception(str(e))
            raise e

system_type() staticmethod

Attributes:
    SystemType (Environment): Requires PYSPARK

Source code in src/sdk/python/rtdip_sdk/pipelines/sources/spark/kafka_eventhub.py
@staticmethod
def system_type():
    """
    Attributes:
        SystemType (Environment): Requires PYSPARK
    """
    return SystemType.PYSPARK

read_batch()

Reads batch data from Kafka.

Source code in src/sdk/python/rtdip_sdk/pipelines/sources/spark/kafka_eventhub.py
def read_batch(self) -> DataFrame:
    """
    Reads batch data from Kafka.
    """
    try:
        df = self.spark.read.format("kafka").options(**self.options).load()
        return self._transform_to_eventhub_schema(df)

    except Py4JJavaError as e:
        logging.exception(e.errmsg)
        raise e
    except Exception as e:
        logging.exception(str(e))
        raise e

read_stream()

Reads streaming data from Kafka.

Source code in src/sdk/python/rtdip_sdk/pipelines/sources/spark/kafka_eventhub.py
def read_stream(self) -> DataFrame:
    """
    Reads streaming data from Kafka.
    """
    try:
        df = self.spark.readStream.format("kafka").options(**self.options).load()
        return self._transform_to_eventhub_schema(df)

    except Py4JJavaError as e:
        logging.exception(e.errmsg)
        raise e
    except Exception as e:
        logging.exception(str(e))
        raise e

SparkKinesisSource

Bases: SourceInterface

The Spark Kinesis Source is used to read data from Kinesis in a Databricks environment. Structured streaming from Kinesis is not supported in open source Spark.

Example

from rtdip_sdk.pipelines.sources import SparkKinesisSource
from rtdip_sdk.pipelines.utilities import SparkSessionUtility

# Not required if using Databricks
spark = SparkSessionUtility(config={}).execute()

kinesis_source = SparkKinesisSource(
    spark=spark,
    options={
        "awsAccessKey": "{AWS-ACCESS-KEY}",
        "awsSecretKey": "{AWS-SECRET-KEY}",
        "streamName": "{STREAM-NAME}",
        "region": "{REGION}",
        "endpoint": "https://kinesis.{REGION}.amazonaws.com",
        "initialPosition": "earliest"
    }
)

kinesis_source.read_stream()

OR

kinesis_source.read_batch()

Parameters:
    spark (SparkSession): Spark Session required to read data from Kinesis. Required.
    options (dict): Options that can be specified for a Kinesis read operation (See Attributes table below). Further information on the options is available here. Required.

Attributes:
    awsAccessKey (str): AWS access key.
    awsSecretKey (str): AWS secret access key corresponding to the access key.
    streamName (List[str]): The stream names to subscribe to.
    region (str): The region the streams are defined in.
    endpoint (str): The regional endpoint for Kinesis Data Streams.
    initialPosition (str): The point to start reading from; earliest, latest, or at_timestamp.

Source code in src/sdk/python/rtdip_sdk/pipelines/sources/spark/kinesis.py
class SparkKinesisSource(SourceInterface):
    """
    The Spark Kinesis Source is used to read data from Kinesis in a Databricks environment.
    Structured streaming from Kinesis is **not** supported in open source Spark.

    Example
    --------
    ```python
    from rtdip_sdk.pipelines.sources import SparkKinesisSource
    from rtdip_sdk.pipelines.utilities import SparkSessionUtility

    # Not required if using Databricks
    spark = SparkSessionUtility(config={}).execute()

    kinesis_source = SparkKinesisSource(
        spark=spark,
        options={
            "awsAccessKey": "{AWS-ACCESS-KEY}",
            "awsSecretKey": "{AWS-SECRET-KEY}",
            "streamName": "{STREAM-NAME}",
            "region": "{REGION}",
            "endpoint": "https://kinesis.{REGION}.amazonaws.com",
            "initialPosition": "earliest"
        }
    )

    kinesis_source.read_stream()

    OR

    kinesis_source.read_batch()
    ```

    Parameters:
        spark (SparkSession): Spark Session required to read data from Kinesis
        options (dict): Options that can be specified for a Kinesis read operation (See Attributes table below). Further information on the options is available [here](https://docs.databricks.com/structured-streaming/kinesis.html#configuration){ target="_blank" }

    Attributes:
        awsAccessKey (str): AWS access key.
        awsSecretKey (str): AWS secret access key corresponding to the access key.
        streamName (List[str]): The stream names to subscribe to.
        region (str): The region the streams are defined in.
        endpoint (str): The regional endpoint for Kinesis Data Streams.
        initialPosition (str): The point to start reading from; earliest, latest, or at_timestamp.
    """

    spark: SparkSession
    options: dict

    def __init__(self, spark: SparkSession, options: dict) -> None:
        self.spark = spark
        self.options = options
        self.schema = KINESIS_SCHEMA

    @staticmethod
    def system_type():
        """
        Attributes:
            SystemType (Environment): Requires PYSPARK_DATABRICKS
        """
        return SystemType.PYSPARK_DATABRICKS

    @staticmethod
    def libraries():
        libraries = Libraries()
        return libraries

    @staticmethod
    def settings() -> dict:
        return {}

    def pre_read_validation(self):
        return True

    def post_read_validation(self, df: DataFrame) -> bool:
        assert df.schema == self.schema
        return True

    def read_batch(self):
        """
        Raises:
            NotImplementedError: Kinesis only supports streaming reads. To perform a batch read, use the read_stream method of this component and specify the Trigger on the write_stream to be `availableNow=True` to perform batch-like reads of cloud storage files.
        """
        raise NotImplementedError(
            "Kinesis only supports streaming reads. To perform a batch read, use the read_stream method and specify Trigger on the write_stream as `availableNow=True`"
        )

    def read_stream(self) -> DataFrame:
        """
        Reads streaming data from Kinesis. All of the data in the table is processed as well as any new data that arrives after the stream started.
        """
        try:
            return (
                self.spark.readStream.format("kinesis").options(**self.options).load()
            )
        except Py4JJavaError as e:
            logging.exception(e.errmsg)
            raise e
        except Exception as e:
            logging.exception(str(e))
            raise e

system_type() staticmethod

Attributes:
    SystemType (Environment): Requires PYSPARK_DATABRICKS

Source code in src/sdk/python/rtdip_sdk/pipelines/sources/spark/kinesis.py
@staticmethod
def system_type():
    """
    Attributes:
        SystemType (Environment): Requires PYSPARK_DATABRICKS
    """
    return SystemType.PYSPARK_DATABRICKS

read_batch()

Raises:
    NotImplementedError: Kinesis only supports streaming reads. To perform a batch read, use the read_stream method of this component and specify the Trigger on the write_stream to be availableNow=True to perform batch-like reads.

Source code in src/sdk/python/rtdip_sdk/pipelines/sources/spark/kinesis.py
def read_batch(self):
    """
    Raises:
        NotImplementedError: Kinesis only supports streaming reads. To perform a batch read, use the read_stream method of this component and specify the Trigger on the write_stream to be `availableNow=True` to perform batch-like reads of cloud storage files.
    """
    raise NotImplementedError(
        "Kinesis only supports streaming reads. To perform a batch read, use the read_stream method and specify Trigger on the write_stream as `availableNow=True`"
    )
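
As the error message above indicates, a batch-like read is achieved by combining read_stream with an availableNow trigger on the write side. The following is a hedged sketch; the Delta format, output path and checkpoint location are placeholders and not part of this component:

df = kinesis_source.read_stream()

(
    df.writeStream
    .trigger(availableNow=True)  # process all currently available records, then stop
    .option("checkpointLocation", "/{CHECKPOINT-PATH}")
    .format("delta")
    .start("/{OUTPUT-PATH}")
)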

read_stream()

Reads streaming data from Kinesis. All of the data in the table is processed as well as any new data that arrives after the stream started.

Source code in src/sdk/python/rtdip_sdk/pipelines/sources/spark/kinesis.py
def read_stream(self) -> DataFrame:
    """
    Reads streaming data from Kinesis. All of the data in the table is processed as well as any new data that arrives after the stream started.
    """
    try:
        return (
            self.spark.readStream.format("kinesis").options(**self.options).load()
        )
    except Py4JJavaError as e:
        logging.exception(e.errmsg)
        raise e
    except Exception as e:
        logging.exception(str(e))
        raise e

BaseISOSource

Bases: SourceInterface

Base class for all the ISO Sources. It provides common functionality and helps in reducing the code redundancy.

Parameters:
    spark (SparkSession): Spark Session instance. Required.
    options (dict): A dictionary of ISO Source specific configurations. Required.
Source code in src/sdk/python/rtdip_sdk/pipelines/sources/spark/iso/base_iso.py
class BaseISOSource(SourceInterface):
    """
    Base class for all the ISO Sources. It provides common functionality and helps in reducing the code redundancy.

    Parameters:
        spark (SparkSession): Spark Session instance
        options (dict): A dictionary of ISO Source specific configurations
    """

    spark: SparkSession
    options: dict
    iso_url: str = "https://"
    query_datetime_format: str = "%Y%m%d"
    required_options: list = []
    spark_schema = StructType([StructField("id", IntegerType(), True)])
    default_query_timezone: str = "UTC"

    def __init__(self, spark: SparkSession, options: dict) -> None:
        self.spark = spark
        self.options = options
        self.query_timezone = pytz.timezone(
            self.options.get("query_timezone", self.default_query_timezone)
        )
        self.current_date = datetime.now(timezone.utc).astimezone(self.query_timezone)

    def _fetch_from_url(self, url_suffix: str) -> bytes:
        """
        Gets data from external ISO API.

        Args:
            url_suffix: String to be used as suffix to iso url.

        Returns:
            Raw content of the data received.

        """
        url = f"{self.iso_url}{url_suffix}"
        logging.info(f"Requesting URL - {url}")

        response = requests.get(url)
        code = response.status_code

        if code != 200:
            raise HTTPError(
                f"Unable to access URL `{url}`."
                f" Received status code {code} with message {response.content}"
            )

        return response.content

    def _get_localized_datetime(self, datetime_str: str) -> datetime:
        """
        Converts string datetime into Python datetime object with configured format and timezone.
        Args:
            datetime_str: String to be converted into datetime.

        Returns: Timezone aware datetime object.

        """
        parsed_dt = datetime.strptime(datetime_str, self.query_datetime_format)
        parsed_dt = parsed_dt.replace(tzinfo=self.query_timezone)
        return parsed_dt

    def _pull_data(self) -> pd.DataFrame:
        """
        Hits the fetch_from_url method with certain parameters to get raw data from API.

        All the children ISO classes must override this method and call the fetch_url method
        in it.

        Returns:
             Raw DataFrame from API.
        """

        return pd.read_csv(BytesIO(self._fetch_from_url("")))

    def _prepare_data(self, df: pd.DataFrame) -> pd.DataFrame:
        """
        Performs all the basic transformations to prepare data for further processing.
        All the children ISO classes must override this method.

        Args:
            df: Raw DataFrame, received from the API.

        Returns:
             Modified DataFrame, ready for basic use.

        """
        return df

    def _sanitize_data(self, df: pd.DataFrame) -> pd.DataFrame:
        """
        Another data transformation helper method to be called after prepare data.
        Used for advance data processing such as cleaning, filtering, restructuring.
        All the children ISO classes must override this method if there is any post-processing required.

        Args:
            df: Initial modified version of DataFrame, received after preparing the data.

        Returns:
             Final version of data after all the fixes and modifications.

        """
        return df

    def _get_data(self) -> pd.DataFrame:
        """
        Entrypoint method to return the final version of DataFrame.

        Returns:
            Modified form of data for specific use case.

        """
        df = self._pull_data()
        df = self._prepare_data(df)
        df = self._sanitize_data(df)

        # Reorder columns to keep the data consistent
        df = df[self.spark_schema.names]

        return df

    @staticmethod
    def system_type():
        return SystemType.PYSPARK

    @staticmethod
    def libraries():
        libraries = Libraries()
        return libraries

    @staticmethod
    def settings() -> dict:
        return {}

    def _validate_options(self) -> bool:
        """
        Performs all the options checks. Raises exception in case of any invalid value.
        Returns:
             True if all checks are passed.

        """
        return True

    def pre_read_validation(self) -> bool:
        """
        Ensures all the required options are provided and performs other validations.
        Returns:
             True if all checks are passed.

        """
        for key in self.required_options:
            if key not in self.options:
                raise ValueError(f"Required option `{key}` is missing.")

        return self._validate_options()

    def post_read_validation(self) -> bool:
        return True

    def read_batch(self) -> DataFrame:
        """
        Spark entrypoint, It executes the entire process of pulling, transforming & fixing data.
        Returns:
             Final Spark DataFrame converted from Pandas DataFrame post-execution.

        """

        try:
            self.pre_read_validation()
            pdf = self._get_data()
            pdf = _prepare_pandas_to_convert_to_spark(pdf)

            # The below is to fix the compatibility issues between Pandas 2.0 and PySpark.
            pd.DataFrame.iteritems = pd.DataFrame.items
            df = self.spark.createDataFrame(data=pdf, schema=self.spark_schema)
            return df

        except Exception as e:
            logging.exception(str(e))
            raise e

    def read_stream(self) -> DataFrame:
        """
        By default, the streaming operation is not supported but child classes can override if ISO supports streaming.

        Returns:
             Final Spark DataFrame after all the processing.

        """

        raise NotImplementedError(
            f"{self.__class__.__name__} connector doesn't support stream operation."
        )

pre_read_validation()

Ensures all the required options are provided and performs other validations.

Returns:
    True if all checks are passed.

Source code in src/sdk/python/rtdip_sdk/pipelines/sources/spark/iso/base_iso.py
def pre_read_validation(self) -> bool:
    """
    Ensures all the required options are provided and performs other validations.
    Returns:
         True if all checks are passed.

    """
    for key in self.required_options:
        if key not in self.options:
            raise ValueError(f"Required option `{key}` is missing.")

    return self._validate_options()

read_batch()

Spark entrypoint; it executes the entire process of pulling, transforming and fixing data.

Returns:
    Final Spark DataFrame converted from Pandas DataFrame post-execution.

Source code in src/sdk/python/rtdip_sdk/pipelines/sources/spark/iso/base_iso.py
def read_batch(self) -> DataFrame:
    """
    Spark entrypoint, It executes the entire process of pulling, transforming & fixing data.
    Returns:
         Final Spark DataFrame converted from Pandas DataFrame post-execution.

    """

    try:
        self.pre_read_validation()
        pdf = self._get_data()
        pdf = _prepare_pandas_to_convert_to_spark(pdf)

        # The below is to fix the compatibility issues between Pandas 2.0 and PySpark.
        pd.DataFrame.iteritems = pd.DataFrame.items
        df = self.spark.createDataFrame(data=pdf, schema=self.spark_schema)
        return df

    except Exception as e:
        logging.exception(str(e))
        raise e

read_stream()

By default, the streaming operation is not supported but child classes can override if ISO supports streaming.

Returns:
    DataFrame: Final Spark DataFrame after all the processing.

Source code in src/sdk/python/rtdip_sdk/pipelines/sources/spark/iso/base_iso.py
def read_stream(self) -> DataFrame:
    """
    By default, the streaming operation is not supported but child classes can override if ISO supports streaming.

    Returns:
         Final Spark DataFrame after all the processing.

    """

    raise NotImplementedError(
        f"{self.__class__.__name__} connector doesn't support stream operation."
    )
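
To illustrate how the methods above fit together, the following is a hedged sketch of a hypothetical subclass; the import path is inferred from the source file location shown above, and the URL, option, schema and column names are invented for illustration only:

from io import BytesIO

import pandas as pd
from pyspark.sql.types import StructType, StructField, StringType, DoubleType

from rtdip_sdk.pipelines.sources.spark.iso.base_iso import BaseISOSource


class ExampleISOSource(BaseISOSource):
    # Hypothetical endpoint; _fetch_from_url appends the suffix passed by _pull_data
    iso_url = "https://example-iso.invalid/api/"
    required_options = ["load_type"]
    spark_schema = StructType(
        [
            StructField("timestamp", StringType(), True),
            StructField("load_mw", DoubleType(), True),
        ]
    )

    def _pull_data(self) -> pd.DataFrame:
        # Reuse the base class helper to download the raw CSV report
        return pd.read_csv(BytesIO(self._fetch_from_url("daily_load.csv")))

    def _prepare_data(self, df: pd.DataFrame) -> pd.DataFrame:
        # Rename raw columns so they match spark_schema before read_batch converts the frame
        return df.rename(columns={"Time": "timestamp", "LoadMW": "load_mw"})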

ERCOTDailyLoadISOSource

Bases: BaseISOSource

The ERCOT Daily Load ISO Source is used to read daily load data from ERCOT using web scraping. It supports actual and forecast data. To read more about the reports, visit the following URLs (the URLs are only accessible if the requester/client is in the US):

For load type actual: Actual System Load by Weather Zone
For load type forecast: Seven-Day Load Forecast by Weather Zone

Parameters:
    spark (SparkSession): Spark Session instance. Required.
    options (dict): A dictionary of ISO Source specific configurations (See Attributes table below). Required.

Attributes:
    load_type (list): Must be one of actual or forecast.
    date (str): Must be in YYYY-MM-DD format.
    certificate_pfx_key (str): The certificate key data or password received from ERCOT.
    certificate_pfx_key_contents (str): The certificate data received from ERCOT; it could be base64 encoded.

Please check the BaseISOSource for available methods.
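
A hedged usage sketch follows, assuming ERCOTDailyLoadISOSource is exported from rtdip_sdk.pipelines.sources in the same way as the other sources on this page; the option values are placeholders:

from rtdip_sdk.pipelines.sources import ERCOTDailyLoadISOSource
from rtdip_sdk.pipelines.utilities import SparkSessionUtility

# Not required if using Databricks
spark = SparkSessionUtility(config={}).execute()

ercot_source = ERCOTDailyLoadISOSource(
    spark=spark,
    options={
        "load_type": "actual",
        "date": "{YYYY-MM-DD}",
        "certificate_pfx_key": "{CERTIFICATE-KEY}",
        "certificate_pfx_key_contents": "{CERTIFICATE-CONTENTS}",
    },
)

df = ercot_source.read_batch()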

BaseISOSource

Bases: SourceInterface

Base class for all the ISO Sources. It provides common functionality and helps in reducing the code redundancy.

Parameters:
    spark (SparkSession): Spark Session instance. Required.
    options (dict): A dictionary of ISO Source specific configurations. Required.
Source code in src/sdk/python/rtdip_sdk/pipelines/sources/spark/iso/base_iso.py
class BaseISOSource(SourceInterface):
    """
    Base class for all the ISO Sources. It provides common functionality and helps in reducing the code redundancy.

    Parameters:
        spark (SparkSession): Spark Session instance
        options (dict): A dictionary of ISO Source specific configurations
    """

    spark: SparkSession
    options: dict
    iso_url: str = "https://"
    query_datetime_format: str = "%Y%m%d"
    required_options: list = []
    spark_schema = StructType([StructField("id", IntegerType(), True)])
    default_query_timezone: str = "UTC"

    def __init__(self, spark: SparkSession, options: dict) -> None:
        self.spark = spark
        self.options = options
        self.query_timezone = pytz.timezone(
            self.options.get("query_timezone", self.default_query_timezone)
        )
        self.current_date = datetime.now(timezone.utc).astimezone(self.query_timezone)

    def _fetch_from_url(self, url_suffix: str) -> bytes:
        """
        Gets data from external ISO API.

        Args:
            url_suffix: String to be used as suffix to iso url.

        Returns:
            Raw content of the data received.

        """
        url = f"{self.iso_url}{url_suffix}"
        logging.info(f"Requesting URL - {url}")

        response = requests.get(url)
        code = response.status_code

        if code != 200:
            raise HTTPError(
                f"Unable to access URL `{url}`."
                f" Received status code {code} with message {response.content}"
            )

        return response.content

    def _get_localized_datetime(self, datetime_str: str) -> datetime:
        """
        Converts string datetime into Python datetime object with configured format and timezone.
        Args:
            datetime_str: String to be converted into datetime.

        Returns: Timezone aware datetime object.

        """
        parsed_dt = datetime.strptime(datetime_str, self.query_datetime_format)
        parsed_dt = parsed_dt.replace(tzinfo=self.query_timezone)
        return parsed_dt

    def _pull_data(self) -> pd.DataFrame:
        """
        Hits the fetch_from_url method with certain parameters to get raw data from API.

        All the children ISO classes must override this method and call the fetch_url method
        in it.

        Returns:
             Raw DataFrame from API.
        """

        return pd.read_csv(BytesIO(self._fetch_from_url("")))

    def _prepare_data(self, df: pd.DataFrame) -> pd.DataFrame:
        """
        Performs all the basic transformations to prepare data for further processing.
        All the children ISO classes must override this method.

        Args:
            df: Raw DataFrame, received from the API.

        Returns:
             Modified DataFrame, ready for basic use.

        """
        return df

    def _sanitize_data(self, df: pd.DataFrame) -> pd.DataFrame:
        """
        Another data transformation helper method to be called after prepare data.
        Used for advance data processing such as cleaning, filtering, restructuring.
        All the children ISO classes must override this method if there is any post-processing required.

        Args:
            df: Initial modified version of DataFrame, received after preparing the data.

        Returns:
             Final version of data after all the fixes and modifications.

        """
        return df

    def _get_data(self) -> pd.DataFrame:
        """
        Entrypoint method to return the final version of DataFrame.

        Returns:
            Modified form of data for specific use case.

        """
        df = self._pull_data()
        df = self._prepare_data(df)
        df = self._sanitize_data(df)

        # Reorder columns to keep the data consistent
        df = df[self.spark_schema.names]

        return df

    @staticmethod
    def system_type():
        return SystemType.PYSPARK

    @staticmethod
    def libraries():
        libraries = Libraries()
        return libraries

    @staticmethod
    def settings() -> dict:
        return {}

    def _validate_options(self) -> bool:
        """
        Performs all the options checks. Raises exception in case of any invalid value.
        Returns:
             True if all checks are passed.

        """
        return True

    def pre_read_validation(self) -> bool:
        """
        Ensures all the required options are provided and performs other validations.
        Returns:
             True if all checks are passed.

        """
        for key in self.required_options:
            if key not in self.options:
                raise ValueError(f"Required option `{key}` is missing.")

        return self._validate_options()

    def post_read_validation(self) -> bool:
        return True

    def read_batch(self) -> DataFrame:
        """
        Spark entrypoint, It executes the entire process of pulling, transforming & fixing data.
        Returns:
             Final Spark DataFrame converted from Pandas DataFrame post-execution.

        """

        try:
            self.pre_read_validation()
            pdf = self._get_data()
            pdf = _prepare_pandas_to_convert_to_spark(pdf)

            # The below is to fix the compatibility issues between Pandas 2.0 and PySpark.
            pd.DataFrame.iteritems = pd.DataFrame.items
            df = self.spark.createDataFrame(data=pdf, schema=self.spark_schema)
            return df

        except Exception as e:
            logging.exception(str(e))
            raise e

    def read_stream(self) -> DataFrame:
        """
        By default, the streaming operation is not supported but child classes can override if ISO supports streaming.

        Returns:
             Final Spark DataFrame after all the processing.

        """

        raise NotImplementedError(
            f"{self.__class__.__name__} connector doesn't support stream operation."
        )

pre_read_validation()

Ensures all the required options are provided and performs other validations. Returns: True if all checks are passed.

Source code in src/sdk/python/rtdip_sdk/pipelines/sources/spark/iso/base_iso.py
def pre_read_validation(self) -> bool:
    """
    Ensures all the required options are provided and performs other validations.
    Returns:
         True if all checks are passed.

    """
    for key in self.required_options:
        if key not in self.options:
            raise ValueError(f"Required option `{key}` is missing.")

    return self._validate_options()
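For a quick illustration of how `pre_read_validation` works together with `required_options`, here is a minimal sketch; the subclass, its option name and the import path (derived from the source file location above) are assumptions for demonstration only, not part of the SDK.

```python
# Minimal sketch of option validation; DemoISOSource is hypothetical.
from rtdip_sdk.pipelines.sources.spark.iso.base_iso import BaseISOSource
from rtdip_sdk.pipelines.utilities import SparkSessionUtility

# Not required if using Databricks
spark = SparkSessionUtility(config={}).execute()


class DemoISOSource(BaseISOSource):
    # Declaring required options makes pre_read_validation enforce their presence.
    required_options = ["date"]


try:
    DemoISOSource(spark, options={}).pre_read_validation()
except ValueError as error:
    print(error)  # Required option `date` is missing.
```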

read_batch()

Spark entrypoint; it executes the entire process of pulling, transforming and fixing data. Returns: Final Spark DataFrame converted from the Pandas DataFrame post-execution.

Source code in src/sdk/python/rtdip_sdk/pipelines/sources/spark/iso/base_iso.py
def read_batch(self) -> DataFrame:
    """
    Spark entrypoint, It executes the entire process of pulling, transforming & fixing data.
    Returns:
         Final Spark DataFrame converted from Pandas DataFrame post-execution.

    """

    try:
        self.pre_read_validation()
        pdf = self._get_data()
        pdf = _prepare_pandas_to_convert_to_spark(pdf)

        # The below is to fix the compatibility issues between Pandas 2.0 and PySpark.
        pd.DataFrame.iteritems = pd.DataFrame.items
        df = self.spark.createDataFrame(data=pdf, schema=self.spark_schema)
        return df

    except Exception as e:
        logging.exception(str(e))
        raise e
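The `pd.DataFrame.iteritems = pd.DataFrame.items` line in the listing above exists because pandas 2.0 removed `DataFrame.iteritems` while some PySpark releases still call it inside `createDataFrame`. The standalone sketch below (independent of the SDK) shows the same workaround in isolation.

```python
# Sketch of the pandas 2.x / PySpark compatibility shim used in read_batch.
import pandas as pd
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()

pdf = pd.DataFrame({"id": [1, 2, 3]})

# pandas 2.0 removed DataFrame.iteritems; older PySpark versions still call it
# when converting pandas data, so alias it back to DataFrame.items if missing.
if not hasattr(pd.DataFrame, "iteritems"):
    pd.DataFrame.iteritems = pd.DataFrame.items

df = spark.createDataFrame(pdf)
df.show()
```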

read_stream()

By default, the streaming operation is not supported but child classes can override if ISO supports streaming.

Returns:

    DataFrame: Final Spark DataFrame after all the processing.

Source code in src/sdk/python/rtdip_sdk/pipelines/sources/spark/iso/base_iso.py
def read_stream(self) -> DataFrame:
    """
    By default, the streaming operation is not supported but child classes can override if ISO supports streaming.

    Returns:
         Final Spark DataFrame after all the processing.

    """

    raise NotImplementedError(
        f"{self.__class__.__name__} connector doesn't support stream operation."
    )
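The methods above form the template that every ISO source follows: override `_pull_data` (and optionally `_prepare_data` / `_sanitize_data`), declare a `spark_schema`, and `read_batch` handles validation and the pandas-to-Spark conversion. The sketch below is a hypothetical subclass showing how these pieces fit together; the endpoint, column names and the `base_iso` import path are illustrative assumptions, not part of the SDK.

```python
# Hypothetical subclass of BaseISOSource; URL, columns and import path are assumptions.
from io import BytesIO

import pandas as pd
from pyspark.sql.types import StructType, StructField, StringType, DoubleType

from rtdip_sdk.pipelines.sources.spark.iso.base_iso import BaseISOSource
from rtdip_sdk.pipelines.utilities import SparkSessionUtility


class ExampleCSVISOSource(BaseISOSource):
    iso_url: str = "https://example.com/reports/"  # placeholder endpoint
    required_options = ["date"]
    spark_schema = StructType(
        [
            StructField("Datetime", StringType(), True),
            StructField("Load", DoubleType(), True),
        ]
    )

    def _pull_data(self) -> pd.DataFrame:
        # Fetch {iso_url}{date}.csv and parse it into a raw pandas DataFrame.
        return pd.read_csv(
            BytesIO(self._fetch_from_url(f"{self.options['date']}.csv"))
        )

    def _prepare_data(self, df: pd.DataFrame) -> pd.DataFrame:
        # Rename raw columns to match spark_schema; _get_data reorders them afterwards.
        return df.rename(columns={"timestamp": "Datetime", "load_mw": "Load"})


# Not required if using Databricks
spark = SparkSessionUtility(config={}).execute()
df = ExampleCSVISOSource(spark, options={"date": "20230520"}).read_batch()
```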
Source code in src/sdk/python/rtdip_sdk/pipelines/sources/spark/iso/ercot_daily_load_iso.py
class ERCOTDailyLoadISOSource(BaseISOSource):
    """
    The ERCOT Daily Load ISO Source is used to read daily load data from ERCOT using web scraping.
    It supports actual and forecast data. To read more about the reports, visit the following URLs
    (the URLs are only accessible if the requester/client is in the US) -

    For load type `actual`: [Actual System Load by Weather Zone](https://www.ercot.com/mp/data-products/
    data-product-details?id=NP6-345-CD)
    <br>
    For load type `forecast`: [Seven-Day Load Forecast by Weather Zone](https://www.ercot.com/mp/data-products/
    data-product-details?id=NP3-561-CD)


    Parameters:
        spark (SparkSession): Spark Session instance
        options (dict): A dictionary of ISO Source specific configurations (See Attributes table below)

    Attributes:
        load_type (str): Must be one of `actual` or `forecast`.
        date (str): Must be in `YYYY-MM-DD` format.
        certificate_pfx_key (str): The certificate key data or password received from ERCOT.
        certificate_pfx_key_contents (str): The certificate data received from ERCOT, it could be base64 encoded.

    Please check the BaseISOSource for available methods.

    BaseISOSource:
        ::: src.sdk.python.rtdip_sdk.pipelines.sources.spark.iso.base_iso
    """

    spark: SparkSession
    options: dict
    url_forecast: str = "https://mis.ercot.com/misapp/GetReports.do?reportTypeId=12312"
    url_actual: str = "https://mis.ercot.com/misapp/GetReports.do?reportTypeId=13101"
    url_prefix: str = "https://mis.ercot.com"
    query_datetime_format: str = "%Y-%m-%d"
    required_options = [
        "load_type",
        "date",
        "certificate_pfx_key",
        "certificate_pfx_key_contents",
    ]
    spark_schema = ERCOT_SCHEMA
    default_query_timezone = "UTC"

    def __init__(self, spark: SparkSession, options: dict) -> None:
        super().__init__(spark, options)
        self.spark = spark
        self.options = options
        self.load_type = self.options.get("load_type", "actual")
        self.date = self.options.get("date", "").strip()
        self.certificate_pfx_key = self.options.get("certificate_pfx_key", "").strip()
        self.certificate_pfx_key_contents = self.options.get(
            "certificate_pfx_key_contents", ""
        ).strip()

    def generate_temp_client_cert_files_from_pfx(self):
        password = self.certificate_pfx_key.encode()
        pfx: bytes = base64.b64decode(self.certificate_pfx_key_contents)

        if base64.b64encode(pfx) != self.certificate_pfx_key_contents.encode():
            pfx = self.certificate_pfx_key_contents

        key, cert, _ = pkcs12.load_key_and_certificates(data=pfx, password=password)
        key_bytes = key.private_bytes(
            encoding=serialization.Encoding.PEM,
            format=serialization.PrivateFormat.TraditionalOpenSSL,
            encryption_algorithm=serialization.NoEncryption(),
        )

        cert_bytes = cert.public_bytes(encoding=serialization.Encoding.PEM)
        return TempCertFiles(cert_bytes, key_bytes)

    def _pull_data(self) -> pd.DataFrame:
        """
        Pulls data from the ERCOT API and parses the zip files for CSV data.

        Returns:
            Raw form of data.
        """

        logging.info(f"Getting {self.load_type} data for date {self.date}")
        url = self.url_forecast
        req_date = datetime.strptime(self.date, self.query_datetime_format)

        if self.load_type == "actual":
            req_date = req_date + timedelta(days=1)
            url = self.url_actual

        url_lists, files = self.generate_urls_for_zip(url, req_date)
        dfs = []
        logging.info(f"Generated {len(url_lists)} URLs - {url_lists}")
        logging.info(f"Requesting files - {files}")

        for url in url_lists:
            df = self.download_zip(url)
            dfs.append(df)
        final_df = pd.concat(dfs)
        return final_df

    def download_zip(self, url) -> pd.DataFrame:
        logging.info(f"Downloading zip using {url}")
        with self.generate_temp_client_cert_files_from_pfx() as cert:
            response = requests.get(url, cert=cert)

        if not response.content:
            raise HTTPError("Empty Response was returned")

        logging.info("Unzipping the file")
        zf = ZipFile(BytesIO(response.content))
        csvs = [s for s in zf.namelist() if ".csv" in s]

        if len(csvs) == 0:
            raise ValueError("No data was found in the specified interval")

        df = pd.read_csv(zf.open(csvs[0]))
        return df

    def generate_urls_for_zip(self, url: str, date: datetime) -> (List[str], List[str]):
        logging.info(f"Finding urls list for date {date}")
        with self.generate_temp_client_cert_files_from_pfx() as cert:
            page_response = requests.get(url, timeout=5, cert=cert)

        page_content = BeautifulSoup(page_response.content, "html.parser")
        zip_info = []
        length = len(page_content.find_all("td", {"class": "labelOptional_ind"}))

        for i in range(0, length):
            zip_name = page_content.find_all("td", {"class": "labelOptional_ind"})[
                i
            ].text
            zip_link = page_content.find_all("a")[i].get("href")
            zip_info.append((zip_name, zip_link))

        date_str = date.strftime("%Y%m%d")
        zip_info = list(
            filter(
                lambda f_info: f_info[0].endswith("csv.zip") and date_str in f_info[0],
                zip_info,
            )
        )

        urls = []
        files = []

        if len(zip_info) == 0:
            raise ValueError(f"No file was found for date - {date_str}")

        # As Forecast is generated every hour, pick the latest one.
        zip_info = sorted(zip_info, key=lambda item: item[0], reverse=True)
        zip_info_item = zip_info[0]

        file_name, file_url = zip_info_item
        urls.append(self.url_prefix + file_url)
        files.append(file_name)

        return urls, files

    def _prepare_data(self, df: pd.DataFrame) -> pd.DataFrame:
        if self.load_type == "actual":
            df["Date"] = pd.to_datetime(df["OperDay"], format="%m/%d/%Y")

            df = df.rename(
                columns={
                    "COAST": "Coast",
                    "EAST": "East",
                    "FAR_WEST": "FarWest",
                    "NORTH": "North",
                    "NORTH_C": "NorthCentral",
                    "SOUTH_C": "SouthCentral",
                    "SOUTHERN": "Southern",
                    "WEST": "West",
                    "TOTAL": "SystemTotal",
                    "DSTFlag": "DstFlag",
                }
            )

        else:
            df = df.rename(columns={"DSTFlag": "DstFlag"})

            df["Date"] = pd.to_datetime(df["DeliveryDate"], format="%m/%d/%Y")

        return df

    def _validate_options(self) -> bool:
        try:
            datetime.strptime(self.date, self.query_datetime_format)
        except ValueError:
            raise ValueError(
                f"Unable to parse date. Please specify in {self.query_datetime_format} format."
            )
        return True

MISODailyLoadISOSource

Bases: BaseISOSource

The MISO Daily Load ISO Source is used to read daily load data from the MISO API. It supports both Actual and Forecast data.

To read more about the available reports from the MISO API, download the file - Market Reports

From the list of reports in the file, it pulls the report named Daily Forecast and Actual Load by Local Resource Zone.

Actual data is available for the day before the given date.

Forecast data is available for the next 6 days (inclusive of the given date).

Example

from rtdip_sdk.pipelines.sources import MISODailyLoadISOSource
from rtdip_sdk.pipelines.utilities import SparkSessionUtility

# Not required if using Databricks
spark = SparkSessionUtility(config={}).execute()

miso_source = MISODailyLoadISOSource(
    spark=spark,
    options={
        "load_type": "actual",
        "date": "20230520",
    }
)

miso_source.read_batch()

Parameters:

    spark (SparkSession): Spark Session instance. Required.
    options (dict): A dictionary of ISO Source specific configurations (see Attributes below). Required.

Attributes:

    load_type (str): Must be one of actual or forecast.
    date (str): Must be in YYYYMMDD format.

Please check the BaseISOSource documented above for available methods.

Source code in src/sdk/python/rtdip_sdk/pipelines/sources/spark/iso/miso_daily_load_iso.py
class MISODailyLoadISOSource(BaseISOSource):
    """
    The MISO Daily Load ISO Source is used to read daily load data from MISO API. It supports both Actual and Forecast data.

    To read more about the available reports from MISO API, download the file -
    [Market Reports](https://cdn.misoenergy.org/Market%20Reports%20Directory115139.xlsx)

    From the list of reports in the file, it pulls the report named
    `Daily Forecast and Actual Load by Local Resource Zone`.

    Actual data is available for one day minus from the given date.

    Forecast data is available for next 6 day (inclusive of given date).


    Example
    --------
    ```python
    from rtdip_sdk.pipelines.sources import MISODailyLoadISOSource
    from rtdip_sdk.pipelines.utilities import SparkSessionUtility

    # Not required if using Databricks
    spark = SparkSessionUtility(config={}).execute()

    miso_source = MISODailyLoadISOSource(
        spark=spark,
        options={
            "load_type": "actual",
            "date": "20230520",
        }
    )

    miso_source.read_batch()
    ```

    Parameters:
        spark (SparkSession): Spark Session instance
        options (dict): A dictionary of ISO Source specific configurations (See Attributes table below)

    Attributes:
        load_type (str): Must be one of `actual` or `forecast`
        date (str): Must be in `YYYYMMDD` format.

    Please check the BaseISOSource for available methods.

    BaseISOSource:
        ::: src.sdk.python.rtdip_sdk.pipelines.sources.spark.iso.base_iso
    """

    spark: SparkSession
    options: dict
    iso_url: str = "https://docs.misoenergy.org/marketreports/"
    query_datetime_format: str = "%Y%m%d"
    required_options = ["load_type", "date"]
    spark_schema = MISO_SCHEMA
    default_query_timezone = "US/Central"

    def __init__(self, spark: SparkSession, options: dict) -> None:
        super().__init__(spark, options)
        self.spark = spark
        self.options = options
        self.load_type = self.options.get("load_type", "actual")
        self.date = self.options.get("date", "").strip()

    def _pull_data(self) -> pd.DataFrame:
        """
        Pulls data from the MISO API and parses the Excel file.

        Returns:
            Raw form of data.
        """

        logging.info(f"Getting {self.load_type} data for date {self.date}")
        df = pd.read_excel(self._fetch_from_url(f"{self.date}_df_al.xls"), skiprows=4)

        return df

    def _prepare_data(self, df: pd.DataFrame) -> pd.DataFrame:
        """
        Creates a new `date_time` column and removes null values.

        Args:
            df: Raw form of data received from the API.

        Returns:
            Data after basic transformations.

        """

        df.drop(
            df.index[(df["HourEnding"] == "HourEnding") | df["MISO MTLF (MWh)"].isna()],
            inplace=True,
        )
        df.rename(columns={"Market Day": "date"}, inplace=True)

        df["date_time"] = pd.to_datetime(df["date"]) + pd.to_timedelta(
            df["HourEnding"].astype(int) - 1, "h"
        )
        df.drop(["HourEnding", "date"], axis=1, inplace=True)

        data_cols = df.columns[df.columns != "date_time"]
        df[data_cols] = df[data_cols].astype(float)

        df.reset_index(inplace=True, drop=True)

        return df

    def _sanitize_data(self, df: pd.DataFrame) -> pd.DataFrame:
        """
        Filter outs Actual or Forecast data based on `load_type`.
        Args:
            df: Data received after preparation.

        Returns:
            Final data either containing Actual or Forecast values.

        """

        skip_col_suffix = ""

        if self.load_type == "actual":
            skip_col_suffix = "MTLF (MWh)"

        elif self.load_type == "forecast":
            skip_col_suffix = "ActualLoad (MWh)"

        df = df[[x for x in df.columns if not x.endswith(skip_col_suffix)]]
        df = df.dropna()
        df.columns = [str(x.split(" ")[0]).upper() for x in df.columns]

        rename_cols = {
            "LRZ1": "Lrz1",
            "LRZ2_7": "Lrz2_7",
            "LRZ3_5": "Lrz3_5",
            "LRZ4": "Lrz4",
            "LRZ6": "Lrz6",
            "LRZ8_9_10": "Lrz8_9_10",
            "MISO": "Miso",
            "DATE_TIME": "Datetime",
        }

        df = df.rename(columns=rename_cols)

        return df

    def _validate_options(self) -> bool:
        """
        Validates the following options:
            - `date` must be in the correct format.
            - `load_type` must be valid.

        Returns:
            True if all looks good otherwise raises Exception.

        """

        try:
            date = self._get_localized_datetime(self.date)
        except ValueError:
            raise ValueError("Unable to parse Date. Please specify in YYYYMMDD format.")

        if date > self.current_date:
            raise ValueError("Query date can't be in future.")

        valid_load_types = ["actual", "forecast"]

        if self.load_type not in valid_load_types:
            raise ValueError(
                f"Invalid load_type `{self.load_type}` given. Supported values are {valid_load_types}."
            )

        return True
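As a follow-up to the class above, the sketch below shows pulling forecast data rather than actuals; `query_timezone` is optional and falls back to the class default of `US/Central`, and the date value is illustrative.

```python
from rtdip_sdk.pipelines.sources import MISODailyLoadISOSource
from rtdip_sdk.pipelines.utilities import SparkSessionUtility

# Not required if using Databricks
spark = SparkSessionUtility(config={}).execute()

# Forecast covers the next 6 days (inclusive of the given date).
forecast_df = MISODailyLoadISOSource(
    spark=spark,
    options={
        "load_type": "forecast",
        "date": "20230520",
        "query_timezone": "US/Central",  # optional; class default is US/Central
    },
).read_batch()

forecast_df.show()
```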

MISOHistoricalLoadISOSource

Bases: MISODailyLoadISOSource

The MISO Historical Load ISO Source is used to read historical load data from the MISO API.

To read more about the available reports from the MISO API, download the file - Market Reports

From the list of reports in the file, it pulls the report named Historical Daily Forecast and Actual Load by Local Resource Zone.

Example

from rtdip_sdk.pipelines.sources import MISOHistoricalLoadISOSource
from rtdip_sdk.pipelines.utilities import SparkSessionUtility

# Not required if using Databricks
spark = SparkSessionUtility(config={}).execute()

miso_source = MISOHistoricalLoadISOSource(
    spark=spark,
    options={
        "start_date": "20230510",
        "end_date": "20230520",
    }
)

miso_source.read_batch()

Parameters:

    spark (SparkSession): Spark Session instance. Required.
    options (dict): A dictionary of ISO Source specific configurations (see Attributes below). Required.

Attributes:

    start_date (str): Must be in YYYYMMDD format.
    end_date (str): Must be in YYYYMMDD format.
    fill_missing (str): Set to "true" to fill missing Actual load with Forecast load. Default - true.

Please check the BaseISOSource documented above for available methods.

Source code in src/sdk/python/rtdip_sdk/pipelines/sources/spark/iso/miso_historical_load_iso.py
class MISOHistoricalLoadISOSource(MISODailyLoadISOSource):
    """
    The MISO Historical Load ISO Source is used to read historical load data from MISO API.

    To read more about the available reports from MISO API, download the file -
     [Market Reports](https://cdn.misoenergy.org/Market%20Reports%20Directory115139.xlsx)

    From the list of reports in the file, it pulls the report named
     `Historical Daily Forecast and Actual Load by Local Resource Zone`.

    Example
    --------
    ```python
    from rtdip_sdk.pipelines.sources import MISOHistoricalLoadISOSource
    from rtdip_sdk.pipelines.utilities import SparkSessionUtility

    # Not required if using Databricks
    spark = SparkSessionUtility(config={}).execute()

    miso_source = MISOHistoricalLoadISOSource(
        spark=spark,
        options={
            "start_date": "20230510",
            "end_date": "20230520",
        }
    )

    miso_source.read_batch()
    ```

    Parameters:
        spark (SparkSession): Spark Session instance
        options (dict): A dictionary of ISO Source specific configurations (See Attributes table below)

    Attributes:
        start_date (str): Must be in `YYYYMMDD` format.
        end_date (str): Must be in `YYYYMMDD` format.
        fill_missing (str): Set to `"true"` to fill missing Actual load with Forecast load. Default - `true`.

    Please check the BaseISOSource for available methods.

    BaseISOSource:
        ::: src.sdk.python.rtdip_sdk.pipelines.sources.spark.iso.base_iso
    """

    spark: SparkSession
    options: dict
    required_options = ["start_date", "end_date"]

    def __init__(self, spark: SparkSession, options: dict):
        super().__init__(spark, options)
        self.start_date = self.options.get("start_date", "")
        self.end_date = self.options.get("end_date", "")
        self.fill_missing = bool(self.options.get("fill_missing", "true") == "true")

    def _get_historical_data_for_date(self, date: datetime) -> pd.DataFrame:
        logging.info(f"Getting historical data for date {date}")
        df = pd.read_excel(
            self._fetch_from_url(
                f"{date.strftime(self.query_datetime_format)}_dfal_HIST.xls"
            ),
            skiprows=5,
        )

        if date.month == 12 and date.day == 31:
            expected_year_rows = (
                pd.Timestamp(date.year, 12, 31).dayofyear * 24 * 7
            )  # Every hour has 7 zones.
            received_year_rows = (
                len(df[df["MarketDay"] != "MarketDay"]) - 2
            )  # Last 2 rows are invalid.

            if expected_year_rows != received_year_rows:
                logging.warning(
                    f"Didn't receive full year historical data for year {date.year}."
                    f" Expected {expected_year_rows} but Received {received_year_rows}"
                )

        return df

    def _pull_data(self) -> pd.DataFrame:
        """
        Pulls data from the MISO API and parses the Excel file.

        Returns:
            Raw form of data.
        """

        logging.info(
            f"Historical load requested from {self.start_date} to {self.end_date}"
        )

        start_date = self._get_localized_datetime(self.start_date)
        end_date = self._get_localized_datetime(self.end_date)

        dates = pd.date_range(
            start_date, end_date + timedelta(days=365), freq="Y", inclusive="left"
        )
        logging.info(f"Generated date ranges are - {dates}")

        # Collect all historical data on yearly basis.
        df = pd.concat(
            [
                self._get_historical_data_for_date(min(date, self.current_date))
                for date in dates
            ],
            sort=False,
        )

        return df

    def _prepare_data(self, df: pd.DataFrame) -> pd.DataFrame:
        """
        Creates a new `Datetime` column, removes null values and pivots the data.

        Args:
            df: Raw form of data received from the API.

        Returns:
            Data after basic transformations and pivoting.

        """

        df = df[df["MarketDay"] != "MarketDay"]

        # Fill missing actual values with the forecast values to avoid gaps.
        if self.fill_missing:
            df = df.fillna({"ActualLoad (MWh)": df["MTLF (MWh)"]})

        df = df.rename(
            columns={
                "MarketDay": "date",
                "HourEnding": "hour",
                "ActualLoad (MWh)": "load",
                "LoadResource Zone": "zone",
            }
        )
        df = df.dropna()

        df["date_time"] = pd.to_datetime(df["date"]) + pd.to_timedelta(
            df["hour"].astype(int) - 1, "h"
        )

        df.drop(["hour", "date"], axis=1, inplace=True)
        df["load"] = df["load"].astype(float)

        df = df.pivot_table(
            index="date_time", values="load", columns="zone"
        ).reset_index()

        df.columns = [str(x.split(" ")[0]).upper() for x in df.columns]

        rename_cols = {
            "LRZ1": "Lrz1",
            "LRZ2_7": "Lrz2_7",
            "LRZ3_5": "Lrz3_5",
            "LRZ4": "Lrz4",
            "LRZ6": "Lrz6",
            "LRZ8_9_10": "Lrz8_9_10",
            "MISO": "Miso",
            "DATE_TIME": "Datetime",
        }

        df = df.rename(columns=rename_cols)

        return df

    def _sanitize_data(self, df: pd.DataFrame) -> pd.DataFrame:
        """
        Filter outs data outside the requested date range.

        Args:
            df: Data received after preparation.

        Returns:
            Final data after all the transformations.

        """

        start_date = self._get_localized_datetime(self.start_date)
        end_date = self._get_localized_datetime(self.end_date).replace(
            hour=23, minute=59, second=59
        )

        df = df[
            (df["Datetime"] >= start_date.replace(tzinfo=None))
            & (df["Datetime"] <= end_date.replace(tzinfo=None))
        ]

        df = df.sort_values(by="Datetime", ascending=True).reset_index(drop=True)

        expected_rows = ((min(end_date, self.current_date) - start_date).days + 1) * 24

        actual_rows = len(df)

        logging.info(f"Rows Expected = {expected_rows}, Rows Found = {actual_rows}")

        return df

    def _validate_options(self) -> bool:
        """
        Validates the following options:
            - `start_date` & `end_data` must be in the correct format.
            - `start_date` must be behind `end_data`.
            - `start_date` must not be in the future (UTC).

        Returns:
            True if all looks good otherwise raises Exception.

        """

        try:
            start_date = self._get_localized_datetime(self.start_date)
        except ValueError:
            raise ValueError(
                "Unable to parse Start date. Please specify in YYYYMMDD format."
            )

        try:
            end_date = self._get_localized_datetime(self.end_date)
        except ValueError:
            raise ValueError(
                "Unable to parse End date. Please specify in YYYYMMDD format."
            )

        if start_date > self.current_date:
            raise ValueError("Start date can't be in future.")

        if start_date > end_date:
            raise ValueError("Start date can't be ahead of End date.")

        return True
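To complement the class above, here is a hedged usage sketch showing a historical date range with `fill_missing` disabled; all option values are illustrative.

```python
from rtdip_sdk.pipelines.sources import MISOHistoricalLoadISOSource
from rtdip_sdk.pipelines.utilities import SparkSessionUtility

# Not required if using Databricks
spark = SparkSessionUtility(config={}).execute()

# fill_missing defaults to "true"; set "false" to keep gaps where Actual load
# has not been published instead of filling them with Forecast values.
historical_df = MISOHistoricalLoadISOSource(
    spark=spark,
    options={
        "start_date": "20230510",
        "end_date": "20230520",
        "fill_missing": "false",
    },
).read_batch()

historical_df.show()
```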

PJMDailyLoadISOSource

Bases: BaseISOSource

The PJM Daily Load ISO Source is used to read daily load data from the PJM API. It supports both Actual and Forecast data. Actual returns 1 day of data; Forecast returns 7 days.

To read more about the reports, visit the following URLs -
Actual doc: ops_sum_prev_period
Forecast doc: load_frcstd_7_day

Example

from rtdip_sdk.pipelines.sources import PJMDailyLoadISOSource
from rtdip_sdk.pipelines.utilities import SparkSessionUtility

# Not required if using Databricks
spark = SparkSessionUtility(config={}).execute()

pjm_source = PJMDailyLoadISOSource(
    spark=spark,
    options={
        "api_key": "{api_key}",
        "load_type": "actual"
    }
)

pjm_source.read_batch()

Parameters:

    spark (SparkSession): Spark Session instance. Required.
    options (dict): A dictionary of ISO Source specific configurations (see Attributes below). Required.

Attributes:

    api_key (str): Must be a valid key from PJM, see api url.
    load_type (str): Must be one of actual or forecast.

Please check the BaseISOSource documented above for available methods.

BaseISOSource

BaseISOSource

Bases: SourceInterface

Base class for all the ISO Sources. It provides common functionality and helps in reducing the code redundancy.

Parameters:

Name Type Description Default
spark SparkSession

Spark Session instance

required
options dict

A dictionary of ISO Source specific configurations

required
Source code in src/sdk/python/rtdip_sdk/pipelines/sources/spark/iso/base_iso.py
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
class BaseISOSource(SourceInterface):
    """
    Base class for all the ISO Sources. It provides common functionality and helps in reducing the code redundancy.

    Parameters:
        spark (SparkSession): Spark Session instance
        options (dict): A dictionary of ISO Source specific configurations
    """

    spark: SparkSession
    options: dict
    iso_url: str = "https://"
    query_datetime_format: str = "%Y%m%d"
    required_options: list = []
    spark_schema = StructType([StructField("id", IntegerType(), True)])
    default_query_timezone: str = "UTC"

    def __init__(self, spark: SparkSession, options: dict) -> None:
        self.spark = spark
        self.options = options
        self.query_timezone = pytz.timezone(
            self.options.get("query_timezone", self.default_query_timezone)
        )
        self.current_date = datetime.now(timezone.utc).astimezone(self.query_timezone)

    def _fetch_from_url(self, url_suffix: str) -> bytes:
        """
        Gets data from external ISO API.

        Args:
            url_suffix: String to be used as suffix to iso url.

        Returns:
            Raw content of the data received.

        """
        url = f"{self.iso_url}{url_suffix}"
        logging.info(f"Requesting URL - {url}")

        response = requests.get(url)
        code = response.status_code

        if code != 200:
            raise HTTPError(
                f"Unable to access URL `{url}`."
                f" Received status code {code} with message {response.content}"
            )

        return response.content

    def _get_localized_datetime(self, datetime_str: str) -> datetime:
        """
        Converts string datetime into Python datetime object with configured format and timezone.
        Args:
            datetime_str: String to be converted into datetime.

        Returns: Timezone aware datetime object.

        """
        parsed_dt = datetime.strptime(datetime_str, self.query_datetime_format)
        parsed_dt = parsed_dt.replace(tzinfo=self.query_timezone)
        return parsed_dt

    def _pull_data(self) -> pd.DataFrame:
        """
        Hits the fetch_from_url method with certain parameters to get raw data from API.

        All the children ISO classes must override this method and call the fetch_url method
        in it.

        Returns:
             Raw DataFrame from API.
        """

        return pd.read_csv(BytesIO(self._fetch_from_url("")))

    def _prepare_data(self, df: pd.DataFrame) -> pd.DataFrame:
        """
        Performs all the basic transformations to prepare data for further processing.
        All the children ISO classes must override this method.

        Args:
            df: Raw DataFrame, received from the API.

        Returns:
             Modified DataFrame, ready for basic use.

        """
        return df

    def _sanitize_data(self, df: pd.DataFrame) -> pd.DataFrame:
        """
        Another data transformation helper method to be called after prepare data.
        Used for advance data processing such as cleaning, filtering, restructuring.
        All the children ISO classes must override this method if there is any post-processing required.

        Args:
            df: Initial modified version of DataFrame, received after preparing the data.

        Returns:
             Final version of data after all the fixes and modifications.

        """
        return df

    def _get_data(self) -> pd.DataFrame:
        """
        Entrypoint method to return the final version of DataFrame.

        Returns:
            Modified form of data for specific use case.

        """
        df = self._pull_data()
        df = self._prepare_data(df)
        df = self._sanitize_data(df)

        # Reorder columns to keep the data consistent
        df = df[self.spark_schema.names]

        return df

    @staticmethod
    def system_type():
        return SystemType.PYSPARK

    @staticmethod
    def libraries():
        libraries = Libraries()
        return libraries

    @staticmethod
    def settings() -> dict:
        return {}

    def _validate_options(self) -> bool:
        """
        Performs all the options checks. Raises exception in case of any invalid value.
        Returns:
             True if all checks are passed.

        """
        return True

    def pre_read_validation(self) -> bool:
        """
        Ensures all the required options are provided and performs other validations.
        Returns:
             True if all checks are passed.

        """
        for key in self.required_options:
            if key not in self.options:
                raise ValueError(f"Required option `{key}` is missing.")

        return self._validate_options()

    def post_read_validation(self) -> bool:
        return True

    def read_batch(self) -> DataFrame:
        """
        Spark entrypoint, It executes the entire process of pulling, transforming & fixing data.
        Returns:
             Final Spark DataFrame converted from Pandas DataFrame post-execution.

        """

        try:
            self.pre_read_validation()
            pdf = self._get_data()
            pdf = _prepare_pandas_to_convert_to_spark(pdf)

            # The below is to fix the compatibility issues between Pandas 2.0 and PySpark.
            pd.DataFrame.iteritems = pd.DataFrame.items
            df = self.spark.createDataFrame(data=pdf, schema=self.spark_schema)
            return df

        except Exception as e:
            logging.exception(str(e))
            raise e

    def read_stream(self) -> DataFrame:
        """
        By default, the streaming operation is not supported but child classes can override if ISO supports streaming.

        Returns:
             Final Spark DataFrame after all the processing.

        """

        raise NotImplementedError(
            f"{self.__class__.__name__} connector doesn't support stream operation."
        )

pre_read_validation()

Ensures all the required options are provided and performs other validations. Returns: True if all checks are passed.

Source code in src/sdk/python/rtdip_sdk/pipelines/sources/spark/iso/base_iso.py
175
176
177
178
179
180
181
182
183
184
185
186
def pre_read_validation(self) -> bool:
    """
    Ensures all the required options are provided and performs other validations.
    Returns:
         True if all checks are passed.

    """
    for key in self.required_options:
        if key not in self.options:
            raise ValueError(f"Required option `{key}` is missing.")

    return self._validate_options()

read_batch()

Spark entrypoint. It executes the entire process of pulling, transforming and fixing the data. Returns: Final Spark DataFrame converted from the Pandas DataFrame post-execution.

Source code in src/sdk/python/rtdip_sdk/pipelines/sources/spark/iso/base_iso.py
def read_batch(self) -> DataFrame:
    """
    Spark entrypoint. It executes the entire process of pulling, transforming and fixing the data.
    Returns:
         Final Spark DataFrame converted from Pandas DataFrame post-execution.

    """

    try:
        self.pre_read_validation()
        pdf = self._get_data()
        pdf = _prepare_pandas_to_convert_to_spark(pdf)

        # The below is to fix the compatibility issues between Pandas 2.0 and PySpark.
        pd.DataFrame.iteritems = pd.DataFrame.items
        df = self.spark.createDataFrame(data=pdf, schema=self.spark_schema)
        return df

    except Exception as e:
        logging.exception(str(e))
        raise e

read_stream()

By default, the streaming operation is not supported but child classes can override if ISO supports streaming.

Returns:

Type Description
DataFrame

Final Spark DataFrame after all the processing.

Source code in src/sdk/python/rtdip_sdk/pipelines/sources/spark/iso/base_iso.py
def read_stream(self) -> DataFrame:
    """
    By default, the streaming operation is not supported but child classes can override if ISO supports streaming.

    Returns:
         Final Spark DataFrame after all the processing.

    """

    raise NotImplementedError(
        f"{self.__class__.__name__} connector doesn't support stream operation."
    )
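
The base class above leaves only a small surface for a concrete connector to implement: declare `spark_schema`, `iso_url` and `required_options`, then override `_pull_data` and `_prepare_data` (and `_sanitize_data` if post-processing is needed). A minimal sketch of a hypothetical subclass, assuming `BaseISOSource` is importable from the module path shown above; the class name, URL, option and column names are illustrative only:

```python
from io import BytesIO

import pandas as pd
from pyspark.sql.types import StructType, StructField, StringType, DoubleType, TimestampType

# Assumed import path, mirroring the source path shown above.
from rtdip_sdk.pipelines.sources.spark.iso.base_iso import BaseISOSource


class ExampleISOSource(BaseISOSource):
    """Hypothetical connector, not part of the SDK; shows the minimal overrides."""

    spark_schema = StructType(
        [
            StructField("StartTime", TimestampType(), True),
            StructField("Zone", StringType(), True),
            StructField("Load", DoubleType(), True),
        ]
    )
    iso_url = "https://example-iso.invalid/api/"  # illustrative base URL
    required_options = ["report_name"]            # illustrative option

    def _pull_data(self) -> pd.DataFrame:
        # _fetch_from_url appends the suffix to iso_url and returns the raw bytes.
        report = self.options["report_name"]
        return pd.read_csv(BytesIO(self._fetch_from_url(f"{report}.csv")))

    def _prepare_data(self, df: pd.DataFrame) -> pd.DataFrame:
        # Rename the raw columns to match spark_schema; _get_data reorders the
        # columns to spark_schema.names before read_batch converts to Spark.
        df = df.rename(columns={"start_time": "StartTime", "zone": "Zone", "load": "Load"})
        df["StartTime"] = pd.to_datetime(df["StartTime"])
        df["Load"] = df["Load"].astype(float)
        return df
```

`read_batch()` then runs the pull, prepare and sanitize steps, reorders the columns to match `spark_schema` and converts the result to a Spark DataFrame.
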
Source code in src/sdk/python/rtdip_sdk/pipelines/sources/spark/iso/pjm_daily_load_iso.py
class PJMDailyLoadISOSource(BaseISOSource):
    """
    The PJM Daily Load ISO Source is used to read daily load data from PJM API.
    It supports both Actual and Forecast data. Actual will return 1 day, Forecast will return 7 days.

    To read more about the reports, visit the following URLs -
    <br>
    Actual doc:    [ops_sum_prev_period](https://dataminer2.pjm.com/feed/ops_sum_prev_period/definition)
    <br>
    Forecast doc:  [load_frcstd_7_day](https://dataminer2.pjm.com/feed/load_frcstd_7_day/definition)

    Example
    --------
    ```python
    from rtdip_sdk.pipelines.sources import PJMDailyLoadISOSource
    from rtdip_sdk.pipelines.utilities import SparkSessionUtility

    # Not required if using Databricks
    spark = SparkSessionUtility(config={}).execute()

    pjm_source = PJMDailyLoadISOSource(
        spark=spark,
        options={
            "api_key": "{api_key}",
            "load_type": "actual"
        }
    )

    pjm_source.read_batch()
    ```

    Parameters:
        spark (SparkSession): Spark Session instance
        options (dict): A dictionary of ISO Source specific configurations (See Attributes table below)

    Attributes:
        api_key (str): Must be a valid key from PJM, see api url
        load_type (str): Must be one of `actual` or `forecast`

    Please check the BaseISOSource for available methods.

    BaseISOSource:
        ::: src.sdk.python.rtdip_sdk.pipelines.sources.spark.iso.base_iso
    """

    spark: SparkSession
    spark_schema = PJM_SCHEMA
    options: dict
    iso_url: str = "https://api.pjm.com/api/v1/"
    query_datetime_format: str = "%Y-%m-%d %H:%M"
    required_options = ["api_key", "load_type"]
    default_query_timezone = "US/Eastern"

    def __init__(self, spark: SparkSession, options: dict) -> None:
        super().__init__(spark, options)
        self.spark: SparkSession = spark
        self.options: dict = options
        self.load_type: str = self.options.get("load_type", "").strip()
        self.api_key: str = self.options.get("api_key", "").strip()
        self.days: int = self.options.get("days", 7)

    def _fetch_from_url(self, url_suffix: str, start_date: str, end_date: str) -> bytes:
        """
        Gets data from external ISO API.

        Args:
            url_suffix: String to be used as suffix to the ISO URL.
            start_date: Start date for the query window.
            end_date: End date for the query window.

        Returns:
            Raw content of the data received.
        """

        url = f"{self.iso_url}{url_suffix}"
        headers = {"Ocp-Apim-Subscription-Key": self.api_key}
        logging.info(
            f"Requesting URL - {url}, start_date={start_date}, end_date={end_date}, load_type={self.load_type}"
        )
        load_key = (
            "datetime_beginning_ept"
            if self.load_type != "forecast"
            else "forecast_datetime_beginning_ept"
        )
        feed = (
            "ops_sum_prev_period"
            if self.load_type != "forecast"
            else "load_frcstd_7_day"
        )
        query = {
            "startRow": "1",
            load_key: f"{start_date}to{end_date}",
            "format": "csv",
            "download": "true",
        }
        query_s = "&".join(["=".join([k, v]) for k, v in query.items()])
        new_url = f"{url}{feed}?{query_s}"
        response = requests.get(new_url, headers=headers)
        code = response.status_code

        if code != 200:
            raise requests.HTTPError(
                f"Unable to access URL `{url}`."
                f" Received status code {code} with message {response.content}"
            )
        return response.content

    def _pull_data(self) -> pd.DataFrame:
        """
        Pulls data from the PJM API and parses the return.

        Returns:
            Raw form of data.
        """
        start_date = self.current_date - timedelta(days=1)
        start_date = start_date.replace(hour=0, minute=0)
        end_date = (start_date + timedelta(days=self.days)).replace(hour=23)
        start_date_str = start_date.strftime(self.query_datetime_format)
        end_date_str = end_date.strftime(self.query_datetime_format)
        df = pd.read_csv(
            BytesIO(self._fetch_from_url("", start_date_str, end_date_str))
        )

        return df

    def _prepare_data(self, df: pd.DataFrame) -> pd.DataFrame:
        """
        Creates a new datetime column, removes null values and renames columns.

        Args:
            df: Raw form of data received from the API.

        Returns:
            Data after basic transformations.

        """

        if self.load_type == "forecast":
            df = df.rename(
                columns={
                    "forecast_datetime_beginning_utc": "start_time",
                    "forecast_area": "zone",
                    "forecast_datetime_ending_utc": "end_time",
                    "forecast_load_mw": "load",
                }
            )
        else:
            df = df.rename(
                columns={
                    "datetime_beginning_utc": "start_time",
                    "area": "zone",
                    "datetime_ending_utc": "end_time",
                    "actual_load": "load",
                }
            )

        df = df[["start_time", "end_time", "zone", "load"]]
        df = df.replace({np.nan: None, "": None})

        date_cols = ["start_time", "end_time"]
        for col in date_cols:
            df[col] = pd.to_datetime(df[col], format="%m/%d/%Y %I:%M:%S %p")

        df["load"] = df["load"].astype(float)
        df = df.replace({np.nan: None, "": None})
        df.columns = list(map(lambda x: x.upper(), df.columns))

        rename_cols = {
            "START_TIME": "StartTime",
            "END_TIME": "EndTime",
            "ZONE": "Zone",
            "LOAD": "Load",
        }

        df = df.rename(columns=rename_cols)

        df.reset_index(inplace=True, drop=True)

        return df

    def _validate_options(self) -> bool:
        """
        Validates the following options:
            - `load_type` must be valid.

        Returns:
            True if all looks good otherwise raises Exception.
        """

        valid_load_types = ["actual", "forecast"]

        if self.load_type not in valid_load_types:
            raise ValueError(
                f"Invalid load_type `{self.load_type}` given. Supported values are {valid_load_types}."
            )

        return True
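
The constructor above also reads an optional `days` option (defaulting to 7) that sets the length of the query window, although it is not listed in the Attributes table. A usage sketch for the forecast feed on that basis:

```python
from rtdip_sdk.pipelines.sources import PJMDailyLoadISOSource
from rtdip_sdk.pipelines.utilities import SparkSessionUtility

# Not required if using Databricks
spark = SparkSessionUtility(config={}).execute()

pjm_forecast_source = PJMDailyLoadISOSource(
    spark=spark,
    options={
        "api_key": "{api_key}",
        "load_type": "forecast",
        "days": 7,  # optional; widens or narrows the query window
    },
)

forecast_df = pjm_forecast_source.read_batch()
```
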

PJMDailyPricingISOSource

Bases: BaseISOSource

The PJM Daily Pricing ISO Source is used to retrieve Real-Time and Day-Ahead hourly data from the PJM API. Real-Time will return data for T - 3 to T days and Day-Ahead will return data for T - 3 to T + 1 days.

API: https://api.pjm.com/api/v1/ (a valid API key from PJM is required)

Real-Time doc: https://dataminer2.pjm.com/feed/rt_hrl_lmps/definition

Day-Ahead doc: https://dataminer2.pjm.com/feed/da_hrl_lmps/definition

Example

from rtdip_sdk.pipelines.sources import PJMDailyPricingISOSource
from rtdip_sdk.pipelines.utilities import SparkSessionUtility

# Not required if using Databricks
spark = SparkSessionUtility(config={}).execute()

pjm_source = PJMDailyPricingISOSource(
    spark=spark,
    options={
        "api_key": "{api_key}",
        "load_type": "real_time"
    }
)

pjm_source.read_batch()

Parameters:

Name Type Description Default
spark SparkSession

Spark Session instance

required
options dict

A dictionary of ISO Source specific configurations (See Attributes table below)

required

Attributes:

Name Type Description
api_key str

Must be a valid key from PJM, see api url

load_type str

Must be one of real_time or day_ahead

Please check the BaseISOSource for available methods.

Source code in src/sdk/python/rtdip_sdk/pipelines/sources/spark/iso/pjm_daily_pricing_iso.py
class PJMDailyPricingISOSource(BaseISOSource):
    """
    The PJM Daily Pricing ISO Source is used to retrieve Real-Time and Day-Ahead hourly data from PJM API.
    Real-Time will return data for T - 3 to T days and Day-Ahead will return data for T - 3 to T + 1 days.

    API:             <a href="https://api.pjm.com/api/v1/">https://api.pjm.com/api/v1/</a>  (a valid API key from PJM is required)

    Real-Time doc:    <a href="https://dataminer2.pjm.com/feed/rt_hrl_lmps/definition">https://dataminer2.pjm.com/feed/rt_hrl_lmps/definition</a>

    Day-Ahead doc:    <a href="https://dataminer2.pjm.com/feed/da_hrl_lmps/definition">https://dataminer2.pjm.com/feed/da_hrl_lmps/definition</a>

    Example
    --------
    ```python
    from rtdip_sdk.pipelines.sources import PJMDailyPricingISOSource
    from rtdip_sdk.pipelines.utilities import SparkSessionUtility

    # Not required if using Databricks
    spark = SparkSessionUtility(config={}).execute()

    pjm_source = PJMDailyPricingISOSource(
        spark=spark,
        options={
            "api_key": "{api_key}",
            "load_type": "real_time"
        }
    )

    pjm_source.read_batch()
    ```

    Parameters:
       spark (SparkSession): Spark Session instance
       options (dict): A dictionary of ISO Source specific configurations (See Attributes table below)

    Attributes:
        api_key (str): Must be a valid key from PJM, see api url
        load_type (str): Must be one of `real_time` or `day_ahead`

    Please check the BaseISOSource for available methods.

    BaseISOSource:
        ::: src.sdk.python.rtdip_sdk.pipelines.sources.spark.iso.base_iso
    """

    spark: SparkSession
    spark_schema = PJM_PRICING_SCHEMA
    options: dict
    iso_url: str = "https://api.pjm.com/api/v1/"
    query_datetime_format: str = "%Y-%m-%d %H:%M"
    required_options = ["api_key", "load_type"]
    default_query_timezone = "US/Eastern"

    def __init__(self, spark: SparkSession, options: dict) -> None:
        super().__init__(spark, options)
        self.spark: SparkSession = spark
        self.options: dict = options
        self.load_type: str = self.options.get("load_type", "").strip()
        self.api_key: str = self.options.get("api_key", "").strip()
        self.days: int = self.options.get("days", 3)

    def _fetch_paginated_data(
        self, url_suffix: str, start_date: str, end_date: str
    ) -> list:
        """
        Fetches data from the PJM API with pagination support.

        Args:
            url_suffix: String to be used as suffix to ISO URL.
            start_date: Start date for the data retrieval.
            end_date: End date for the data retrieval.

        Returns:
            List of items (dictionaries) parsed from the paginated JSON responses.
        """
        headers = {"Ocp-Apim-Subscription-Key": self.api_key}
        items = []
        query = {
            "startRow": "1",
            "rowCount": "5",
            "datetime_beginning_ept": f"{start_date}to{end_date}",
        }
        query_s = "&".join(["=".join([k, v]) for k, v in query.items()])
        base_url = f"{self.iso_url}{url_suffix}?{query_s}"

        next_page = base_url

        logging.info(
            f"Requesting URL - {base_url}, start_date={start_date}, end_date={end_date}, load_type={self.load_type}"
        )

        while next_page:
            now = datetime.now()
            logging.info(f"Timestamp: {now}")
            response = requests.get(next_page, headers=headers)
            code = response.status_code

            if code != 200:
                raise requests.HTTPError(
                    f"Unable to access URL `{next_page}`."
                    f" Received status code {code} with message {response.content}"
                )

            data = response.json()

            logging.info(f"Data for page {next_page}:")
            items.extend(data["items"])
            next_urls = list(filter(lambda item: item["rel"] == "next", data["links"]))
            next_page = next_urls[0]["href"] if next_urls else None
            time.sleep(10)

        return items

    def _pull_data(self) -> pd.DataFrame:
        """
        Pulls data from the PJM API and parses the return.

        Returns:
            Raw form of data.
        """
        start_date = self.current_date - timedelta(self.days)
        start_date = start_date.replace(hour=0, minute=0)
        end_date = (start_date + timedelta(days=self.days)).replace(hour=23)
        start_date_str = start_date.strftime(self.query_datetime_format)
        end_date_str = end_date.strftime(self.query_datetime_format)

        if self.load_type == "day_ahead":
            url_suffix = "da_hrl_lmps"
        else:
            url_suffix = "rt_hrl_lmps"

        data = self._fetch_paginated_data(url_suffix, start_date_str, end_date_str)

        df = pd.DataFrame(data)
        logging.info(f"Data fetched successfully: {len(df)} rows")

        return df

    def _prepare_data(self, df: pd.DataFrame) -> pd.DataFrame:
        """
        Creates a new datetime column, removes null values and renames columns.

        Args:
            df: Raw form of data received from the API.

        Returns:
            Data after basic transformations.

        """

        if self.load_type == "day_ahead":
            df = df.rename(
                columns={
                    "datetime_beginning_utc": "StartTime",
                    "pnode_id": "PnodeId",
                    "pnode_name": "PnodeName",
                    "voltage": "Voltage",
                    "equipment": "Equipment",
                    "type": "Type",
                    "zone": "Zone",
                    "system_energy_price_da": "SystemEnergyPrice",
                    "total_lmp_da": "TotalLmp",
                    "congestion_price_da": "CongestionPrice",
                    "marginal_loss_price_da": "MarginalLossPrice",
                    "version_nbr": "VersionNbr",
                }
            )
        else:
            df = df.rename(
                columns={
                    "datetime_beginning_utc": "StartTime",
                    "pnode_id": "PnodeId",
                    "pnode_name": "PnodeName",
                    "voltage": "Voltage",
                    "equipment": "Equipment",
                    "type": "Type",
                    "zone": "Zone",
                    "system_energy_price_rt": "SystemEnergyPrice",
                    "total_lmp_rt": "TotalLmp",
                    "congestion_price_rt": "CongestionPrice",
                    "marginal_loss_price_rt": "MarginalLossPrice",
                    "version_nbr": "VersionNbr",
                }
            )

        df = df[
            [
                "StartTime",
                "PnodeId",
                "PnodeName",
                "Voltage",
                "Equipment",
                "Type",
                "Zone",
                "SystemEnergyPrice",
                "TotalLmp",
                "CongestionPrice",
                "MarginalLossPrice",
                "VersionNbr",
            ]
        ]

        df = df.replace({np.nan: None, "": None})

        df["StartTime"] = pd.to_datetime(df["StartTime"])
        df = df.replace({np.nan: None, "": None})

        df.reset_index(inplace=True, drop=True)

        return df

    def _validate_options(self) -> bool:
        """
        Validates the following options:
            - `load_type` must be valid.

        Returns:
            True if all looks good otherwise raises Exception.
        """

        valid_load_types = ["real_time", "day_ahead"]

        if self.load_type not in valid_load_types:
            raise ValueError(
                f"Invalid load_type `{self.load_type}` given. Supported values are {valid_load_types}."
            )

        return True
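
For Day-Ahead prices the same source is configured with `load_type` set to `day_ahead`, the only other value accepted by `_validate_options`; a usage sketch mirroring the docstring example:

```python
from rtdip_sdk.pipelines.sources import PJMDailyPricingISOSource
from rtdip_sdk.pipelines.utilities import SparkSessionUtility

# Not required if using Databricks
spark = SparkSessionUtility(config={}).execute()

# Day-Ahead hourly LMPs; use "real_time" for the Real-Time feed instead.
pjm_pricing_source = PJMDailyPricingISOSource(
    spark=spark,
    options={
        "api_key": "{api_key}",
        "load_type": "day_ahead",
    },
)

day_ahead_df = pjm_pricing_source.read_batch()
```
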

PJMHistoricalPricingISOSource

Bases: PJMDailyPricingISOSource

The PJM Historical Pricing ISO Source is used to retrieve historical Real-Time and Day-Ahead hourly data from the PJM API.

API: https://api.pjm.com/api/v1/ (a valid API key from PJM is required)

Real-Time doc: https://dataminer2.pjm.com/feed/rt_hrl_lmps/definition

Day-Ahead doc: https://dataminer2.pjm.com/feed/da_hrl_lmps/definition

The PJM Historical Pricing ISO Source accesses the same PJM endpoints as the daily pricing source but is tailored for retrieving data within a specified historical range defined by the start_date and end_date attributes.

Example

from rtdip_sdk.pipelines.sources import PJMHistoricalPricingISOSource
from rtdip_sdk.pipelines.utilities import SparkSessionUtility

# Not required if using Databricks
spark = SparkSessionUtility(config={}).execute()

pjm_source = PJMHistoricalPricingISOSource(
    spark=spark,
    options={
        "api_key": "{api_key}",
        "start_date": "2023-05-10",
        "end_date": "2023-05-20",
    }
)

pjm_source.read_batch()

Parameters:

Name Type Description Default
spark SparkSession

The Spark Session instance.

required
options dict

A dictionary of ISO Source specific configurations.

required

Attributes:

Name Type Description
api_key str

A valid key from PJM required for authentication.

load_type str

The type of data to retrieve, either real_time or day_ahead.

start_date str

Must be in YYYY-MM-DD format.

end_date str

Must be in YYYY-MM-DD format.

Please refer to the BaseISOSource for available methods and further details.

BaseISOSource: ::: src.sdk.python.rtdip_sdk.pipelines.sources.spark.iso.base_iso

Source code in src/sdk/python/rtdip_sdk/pipelines/sources/spark/iso/pjm_historical_pricing_iso.py
class PJMHistoricalPricingISOSource(PJMDailyPricingISOSource):
    """
    The PJM Historical Pricing ISO Source is used to retrieve historical Real-Time and Day-Ahead hourly data from the PJM API.

    API:             <a href="https://api.pjm.com/api/v1/">https://api.pjm.com/api/v1/</a>  (a valid API key from PJM is required)

    Real-Time doc:    <a href="https://dataminer2.pjm.com/feed/rt_hrl_lmps/definition">https://dataminer2.pjm.com/feed/rt_hrl_lmps/definition</a>

    Day-Ahead doc:    <a href="https://dataminer2.pjm.com/feed/da_hrl_lmps/definition">https://dataminer2.pjm.com/feed/da_hrl_lmps/definition</a>

    The PJM Historical Pricing ISO Source accesses the same PJM endpoints as the daily pricing source but is tailored for retrieving data within a specified historical range defined by the `start_date` and `end_date` attributes.

    Example
    --------
    ```python
    from rtdip_sdk.pipelines.sources import PJMHistoricalPricingISOSource
    from rtdip_sdk.pipelines.utilities import SparkSessionUtility

    # Not required if using Databricks
    spark = SparkSessionUtility(config={}).execute()

    pjm_source = PJMHistoricalPricingISOSource(
        spark=spark,
        options={
            "api_key": "{api_key}",
            "start_date": "2023-05-10",
            "end_date": "2023-05-20",
        }
    )

    pjm_source.read_batch()
    ```

    Parameters:
        spark (SparkSession): The Spark Session instance.
        options (dict): A dictionary of ISO Source specific configurations.

    Attributes:
        api_key (str): A valid key from PJM required for authentication.
        load_type (str): The type of data to retrieve, either `real_time` or `day_ahead`.
        start_date (str): Must be in `YYYY-MM-DD` format.
        end_date (str): Must be in `YYYY-MM-DD` format.

    Please refer to the BaseISOSource for available methods and further details.

    BaseISOSource: ::: src.sdk.python.rtdip_sdk.pipelines.sources.spark.iso.base_iso"""

    spark: SparkSession
    options: dict
    required_options = ["api_key", "load_type", "start_date", "end_date"]

    def __init__(self, spark: SparkSession, options: dict) -> None:
        super().__init__(spark, options)
        self.spark: SparkSession = spark
        self.options: dict = options
        self.start_date: str = self.options.get("start_date", "")
        self.end_date: str = self.options.get("end_date", "")
        self.user_datetime_format = "%Y-%m-%d"

    def _pull_data(self) -> pd.DataFrame:
        """
        Pulls historical pricing data from the PJM API within the specified date range.

        Returns:
            pd.DataFrame: A DataFrame containing the raw historical pricing data retrieved from the PJM API.
        """

        logging.info(
            f"Historical data requested from {self.start_date} to {self.end_date}"
        )

        start_date_str = datetime.strptime(
            self.start_date, self.user_datetime_format
        ).replace(hour=0, minute=0)
        end_date_str = datetime.strptime(
            self.end_date, self.user_datetime_format
        ).replace(hour=23)

        if self.load_type == "day_ahead":
            url_suffix = "da_hrl_lmps"
        else:
            url_suffix = "rt_hrl_lmps"

        data = self._fetch_paginated_data(url_suffix, start_date_str, end_date_str)

        df = pd.DataFrame(data)
        logging.info(f"Data fetched successfully: {len(df)} rows")

        return df

    def _validate_options(self) -> bool:
        """
        Validates all parameters, including the following checks:
            - `start_date` & `end_date` must be in the correct format.
            - `start_date` must not be after `end_date`.
            - `start_date` must not be in the future (UTC).

        Returns:
            True if all looks good otherwise raises Exception.

        """
        super()._validate_options()
        try:
            start_date = datetime.strptime(self.start_date, self.user_datetime_format)
        except ValueError:
            raise ValueError(
                f"Unable to parse Start date. Please specify in {self.user_datetime_format} format."
            )

        try:
            end_date = datetime.strptime(self.end_date, self.user_datetime_format)
        except ValueError:
            raise ValueError(
                f"Unable to parse End date. Please specify in {self.user_datetime_format} format."
            )

        if start_date > datetime.now(timezone.utc).replace(tzinfo=None) - timedelta(
            days=1
        ):
            raise ValueError("Start date can't be in future.")

        if start_date > end_date:
            raise ValueError("Start date can't be ahead of End date.")

        if end_date > datetime.now(timezone.utc).replace(tzinfo=None) - timedelta(
            days=1
        ):
            raise ValueError("End date can't be in future.")

        return True

PJMHistoricalLoadISOSource

Bases: PJMDailyLoadISOSource

The PJM Historical Load ISO Source is used to read historical load data from PJM API.

To read more about the reports, visit the following URLs -
Actual doc: ops_sum_prev_period
Forecast doc: load_frcstd_7_day

Historical is the same PJM endpoint as Actual, but it is called repeatedly within a range established by the start_date & end_date attributes.

Example

from rtdip_sdk.pipelines.sources import PJMHistoricalLoadISOSource
from rtdip_sdk.pipelines.utilities import SparkSessionUtility

# Not required if using Databricks
spark = SparkSessionUtility(config={}).execute()

pjm_source = PJMHistoricalLoadISOSource(
    spark=spark,
    options={
        "api_key": "{api_key}",
        "start_date": "20230510",
        "end_date": "20230520",
    }
)

pjm_source.read_batch()

Parameters:

Name Type Description Default
spark SparkSession

Spark Session instance

required
options dict

A dictionary of ISO Source specific configurations (See Attributes table below)

required

Attributes:

Name Type Description
api_key str

Must be a valid key from PJM, see PJM documentation

start_date str

Must be in YYYY-MM-DD format.

end_date str

Must be in YYYY-MM-DD format.

query_batch_days int

(optional) Number of days per request batch; must be < 160 as per PJM, defaults to 120

sleep_duration int

(optional) Number of seconds to sleep between requests, defaults to 5 seconds, used to manage requests to the PJM endpoint

request_count int

(optional) Number of requests made to the PJM endpoint before sleeping for sleep_duration, defaults to 1

Please check the BaseISOSource for available methods.
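
The optional batching controls listed above can be combined in the options dictionary; a sketch with illustrative values:

```python
from rtdip_sdk.pipelines.sources import PJMHistoricalLoadISOSource
from rtdip_sdk.pipelines.utilities import SparkSessionUtility

# Not required if using Databricks
spark = SparkSessionUtility(config={}).execute()

pjm_source = PJMHistoricalLoadISOSource(
    spark=spark,
    options={
        "api_key": "{api_key}",
        "start_date": "2023-01-01",
        "end_date": "2023-10-28",
        "query_batch_days": 120,  # must be < 160 as per PJM
        "sleep_duration": 5,      # seconds to pause between request batches
        "request_count": 1,       # requests made before each pause
    },
)

load_df = pjm_source.read_batch()
```
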

Source code in src/sdk/python/rtdip_sdk/pipelines/sources/spark/iso/pjm_historical_load_iso.py
class PJMHistoricalLoadISOSource(PJMDailyLoadISOSource):
    """
    The PJM Historical Load ISO Source is used to read historical load data from PJM API.

    To read more about the reports, visit the following URLs -
    <br>
    Actual doc:    [ops_sum_prev_period](https://dataminer2.pjm.com/feed/ops_sum_prev_period/definition)
    <br>
    Forecast doc:  [load_frcstd_7_day](https://dataminer2.pjm.com/feed/load_frcstd_7_day/definition)

    Historical is the same PJM endpoint as Actual, but is called repeatedly within a range established by the
    start_date & end_date attributes.

    Example
    --------
    ```python
    from rtdip_sdk.pipelines.sources import PJMHistoricalLoadISOSource
    from rtdip_sdk.pipelines.utilities import SparkSessionUtility

    # Not required if using Databricks
    spark = SparkSessionUtility(config={}).execute()

    pjm_source = PJMHistoricalLoadISOSource(
        spark=spark,
        options={
            "api_key": "{api_key}",
            "start_date": "20230510",
            "end_date": "20230520",
        }
    )

    pjm_source.read_batch()
    ```

    Parameters:
        spark (SparkSession): Spark Session instance
        options (dict): A dictionary of ISO Source specific configurations (See Attributes table below)

    Attributes:
        api_key (str): Must be a valid key from PJM, see PJM documentation
        start_date (str): Must be in `YYYY-MM-DD` format.
        end_date (str): Must be in `YYYY-MM-DD` format.

        query_batch_days (int): (optional) Number of days per request batch; must be < 160 as per PJM, defaults to `120`
        sleep_duration (int): (optional) Number of seconds to sleep between requests, defaults to `5` seconds, used to manage requests to the PJM endpoint
        request_count (int): (optional) Number of requests made to the PJM endpoint before sleeping for sleep_duration, defaults to `1`

    Please check the BaseISOSource for available methods.

    BaseISOSource:
        ::: src.sdk.python.rtdip_sdk.pipelines.sources.spark.iso.base_iso"""

    spark: SparkSession
    options: dict
    required_options = ["api_key", "start_date", "end_date"]

    def __init__(self, spark: SparkSession, options: dict) -> None:
        super().__init__(spark, options)
        self.spark: SparkSession = spark
        self.options: dict = options
        self.api_key: str = self.options.get("api_key", "").strip()
        self.start_date: str = self.options.get("start_date", "")
        self.end_date: str = self.options.get("end_date", "")
        self.query_batch_days: int = self.options.get("query_batch_days", 120)
        self.sleep_duration: int = self.options.get("sleep_duration", 5)
        self.request_count: int = self.options.get("request_count", 1)
        self.load_type: str = "actual"
        self.user_datetime_format = "%Y-%m-%d"

    def _pull_data(self) -> pd.DataFrame:
        """
        Pulls data from the PJM API and parses the return including date ranges.

        Returns:
            Raw form of data.
        """

        logging.info(
            f"Historical load requested from {self.start_date} to {self.end_date}"
        )
        start_date = datetime.strptime(self.start_date, self.user_datetime_format)
        end_date = datetime.strptime(self.end_date, self.user_datetime_format).replace(
            hour=23
        )

        days_diff = (end_date - start_date).days
        logging.info(f"Expected hours for a single zone = {(days_diff + 1) * 24}")
        generated_days_ranges = []
        dates = pd.date_range(
            start_date, end_date, freq=pd.DateOffset(days=self.query_batch_days)
        )

        for date in dates:
            py_date = date.to_pydatetime()
            date_last = (py_date + timedelta(days=self.query_batch_days - 1)).replace(
                hour=23
            )
            date_last = min(date_last, end_date)
            generated_days_ranges.append((py_date, date_last))

        logging.info(
            f"Generated date ranges for batch days {self.query_batch_days} are {generated_days_ranges}"
        )

        # Collect all historical data on yearly basis.
        dfs = []
        for idx, date_range in enumerate(generated_days_ranges):
            start_date_str = date_range[0].strftime(self.query_datetime_format)
            end_date_str = date_range[1].strftime(self.query_datetime_format)

            df = pd.read_csv(
                BytesIO(self._fetch_from_url("", start_date_str, end_date_str))
            )
            dfs.append(df)

            if idx > 0 and idx % self.request_count == 0:
                logging.info(f"Going to sleep for {self.sleep_duration} seconds")
                time.sleep(self.sleep_duration)

        df = pd.concat(dfs, sort=False)
        df = df.reset_index(drop=True)
        return df

    def _validate_options(self) -> bool:
        """
        Validates all parameters, including the following checks:
            - `start_date` & `end_date` must be in the correct format.
            - `start_date` must not be after `end_date`.
            - `start_date` must not be in the future (UTC).

        Returns:
            True if all looks good otherwise raises Exception.

        """

        try:
            start_date = datetime.strptime(self.start_date, self.user_datetime_format)
        except ValueError:
            raise ValueError(
                f"Unable to parse Start date. Please specify in {self.user_datetime_format} format."
            )

        try:
            end_date = datetime.strptime(self.end_date, self.user_datetime_format)
        except ValueError:
            raise ValueError(
                f"Unable to parse End date. Please specify in {self.user_datetime_format} format."
            )

        if start_date > datetime.now(timezone.utc).replace(tzinfo=None) - timedelta(
            days=1
        ):
            raise ValueError("Start date can't be in future.")

        if start_date > end_date:
            raise ValueError("Start date can't be ahead of End date.")

        if end_date > datetime.now(timezone.utc).replace(tzinfo=None) - timedelta(
            days=1
        ):
            raise ValueError("End date can't be in future.")

        if self.sleep_duration < 0:
            raise ValueError("Sleep duration can't be negative.")

        if self.request_count < 0:
            raise ValueError("Request count can't be negative.")

        if self.query_batch_days < 0:
            raise ValueError("Query batch days count can't be negative.")

        return True
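
`_pull_data` above splits the requested range into `query_batch_days`-sized windows before calling the API, clamping the final window to `end_date`. A standalone sketch of that window generation, using illustrative dates and the default batch size:

```python
from datetime import datetime, timedelta

import pandas as pd

# Reproduces the window-splitting logic of _pull_data for a 300-day range
# with the default query_batch_days of 120.
start_date = datetime(2023, 1, 1)
end_date = datetime(2023, 10, 28).replace(hour=23)
query_batch_days = 120

windows = []
for date in pd.date_range(start_date, end_date, freq=pd.DateOffset(days=query_batch_days)):
    py_date = date.to_pydatetime()
    window_end = min((py_date + timedelta(days=query_batch_days - 1)).replace(hour=23), end_date)
    windows.append((py_date, window_end))

for window_start, window_end in windows:
    print(window_start.strftime("%Y-%m-%d %H:%M"), "->", window_end.strftime("%Y-%m-%d %H:%M"))
# Produces three windows: two full 120-day windows and a final shorter
# window clamped to end_date.
```
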

CAISODailyLoadISOSource

Bases: BaseISOSource

The CAISO Daily Load ISO Source is used to read daily load data from CAISO API. It supports multiple types of data. Check the load_types attribute.

To read more about the available reports from CAISO API, download the file - Interface Specification

From the list of reports in the file, it pulls the report named CAISO Demand Forecast.

Parameters:

Name Type Description Default
spark SparkSession

Spark Session instance

required
options dict

A dictionary of ISO Source specific configurations (See Attributes table below)

required

Attributes:

Name Type Description
load_types list

Must be a subset of [Demand Forecast 7-Day Ahead, Demand Forecast 2-Day Ahead, Demand Forecast Day Ahead, RTM 15Min Load Forecast, RTM 5Min Load Forecast, Total Actual Hourly Integrated Load].
Default Value - [Total Actual Hourly Integrated Load].

date str

Must be in YYYY-MM-DD format.

Please check the BaseISOSource for available methods.
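
A usage sketch based on the attributes above, assuming the class is exported from `rtdip_sdk.pipelines.sources` like the other ISO sources and that the option names match the Attributes table; values are illustrative:

```python
from rtdip_sdk.pipelines.sources import CAISODailyLoadISOSource
from rtdip_sdk.pipelines.utilities import SparkSessionUtility

# Not required if using Databricks
spark = SparkSessionUtility(config={}).execute()

# Option names follow the Attributes table above; values are illustrative.
caiso_source = CAISODailyLoadISOSource(
    spark=spark,
    options={
        "load_types": ["Total Actual Hourly Integrated Load"],
        "date": "2023-05-10",
    },
)

caiso_df = caiso_source.read_batch()
```
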

BaseISOSource

BaseISOSource

Bases: SourceInterface

Base class for all the ISO Sources. It provides common functionality and helps in reducing the code redundancy.

Parameters:

Name Type Description Default
spark SparkSession

Spark Session instance

required
options dict

A dictionary of ISO Source specific configurations

required
Source code in src/sdk/python/rtdip_sdk/pipelines/sources/spark/iso/base_iso.py
class BaseISOSource(SourceInterface):
    """
    Base class for all the ISO Sources. It provides common functionality and helps in reducing the code redundancy.

    Parameters:
        spark (SparkSession): Spark Session instance
        options (dict): A dictionary of ISO Source specific configurations
    """

    spark: SparkSession
    options: dict
    iso_url: str = "https://"
    query_datetime_format: str = "%Y%m%d"
    required_options: list = []
    spark_schema = StructType([StructField("id", IntegerType(), True)])
    default_query_timezone: str = "UTC"

    def __init__(self, spark: SparkSession, options: dict) -> None:
        self.spark = spark
        self.options = options
        self.query_timezone = pytz.timezone(
            self.options.get("query_timezone", self.default_query_timezone)
        )
        self.current_date = datetime.now(timezone.utc).astimezone(self.query_timezone)

    def _fetch_from_url(self, url_suffix: str) -> bytes:
        """
        Gets data from external ISO API.

        Args:
            url_suffix: String to be used as suffix to iso url.

        Returns:
            Raw content of the data received.

        """
        url = f"{self.iso_url}{url_suffix}"
        logging.info(f"Requesting URL - {url}")

        response = requests.get(url)
        code = response.status_code

        if code != 200:
            raise HTTPError(
                f"Unable to access URL `{url}`."
                f" Received status code {code} with message {response.content}"
            )

        return response.content

    def _get_localized_datetime(self, datetime_str: str) -> datetime:
        """
        Converts string datetime into Python datetime object with configured format and timezone.
        Args:
            datetime_str: String to be converted into datetime.

        Returns: Timezone aware datetime object.

        """
        parsed_dt = datetime.strptime(datetime_str, self.query_datetime_format)
        parsed_dt = parsed_dt.replace(tzinfo=self.query_timezone)
        return parsed_dt

    def _pull_data(self) -> pd.DataFrame:
        """
        Hits the fetch_from_url method with certain parameters to get raw data from API.

        All the children ISO classes must override this method and call the fetch_url method
        in it.

        Returns:
             Raw DataFrame from API.
        """

        return pd.read_csv(BytesIO(self._fetch_from_url("")))

    def _prepare_data(self, df: pd.DataFrame) -> pd.DataFrame:
        """
        Performs all the basic transformations to prepare data for further processing.
        All the children ISO classes must override this method.

        Args:
            df: Raw DataFrame, received from the API.

        Returns:
             Modified DataFrame, ready for basic use.

        """
        return df

    def _sanitize_data(self, df: pd.DataFrame) -> pd.DataFrame:
        """
        Another data transformation helper method to be called after prepare data.
        Used for advance data processing such as cleaning, filtering, restructuring.
        All the children ISO classes must override this method if there is any post-processing required.

        Args:
            df: Initial modified version of DataFrame, received after preparing the data.

        Returns:
             Final version of data after all the fixes and modifications.

        """
        return df

    def _get_data(self) -> pd.DataFrame:
        """
        Entrypoint method to return the final version of DataFrame.

        Returns:
            Modified form of data for specific use case.

        """
        df = self._pull_data()
        df = self._prepare_data(df)
        df = self._sanitize_data(df)

        # Reorder columns to keep the data consistent
        df = df[self.spark_schema.names]

        return df

    @staticmethod
    def system_type():
        return SystemType.PYSPARK

    @staticmethod
    def libraries():
        libraries = Libraries()
        return libraries

    @staticmethod
    def settings() -> dict:
        return {}

    def _validate_options(self) -> bool:
        """
        Performs all the options checks. Raises exception in case of any invalid value.
        Returns:
             True if all checks are passed.

        """
        return True

    def pre_read_validation(self) -> bool:
        """
        Ensures all the required options are provided and performs other validations.
        Returns:
             True if all checks are passed.

        """
        for key in self.required_options:
            if key not in self.options:
                raise ValueError(f"Required option `{key}` is missing.")

        return self._validate_options()

    def post_read_validation(self) -> bool:
        return True

    def read_batch(self) -> DataFrame:
        """
        Spark entrypoint, It executes the entire process of pulling, transforming & fixing data.
        Returns:
             Final Spark DataFrame converted from Pandas DataFrame post-execution.

        """

        try:
            self.pre_read_validation()
            pdf = self._get_data()
            pdf = _prepare_pandas_to_convert_to_spark(pdf)

            # The below is to fix the compatibility issues between Pandas 2.0 and PySpark.
            pd.DataFrame.iteritems = pd.DataFrame.items
            df = self.spark.createDataFrame(data=pdf, schema=self.spark_schema)
            return df

        except Exception as e:
            logging.exception(str(e))
            raise e

    def read_stream(self) -> DataFrame:
        """
        By default, the streaming operation is not supported but child classes can override if ISO supports streaming.

        Returns:
             Final Spark DataFrame after all the processing.

        """

        raise NotImplementedError(
            f"{self.__class__.__name__} connector doesn't support stream operation."
        )
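
As a hedged illustration of how these hooks fit together (the subclass below is not part of the SDK, and its endpoint, options and columns are invented), a custom source typically points iso_url at its API, declares a spark_schema, and overrides _pull_data and _prepare_data; _get_data then chains pull, prepare and sanitize before read_batch converts the result into a Spark DataFrame.

```python
# Hypothetical subclass for illustration only; the import path is assumed from
# the source path above, and the endpoint, options and columns are made up.
from io import BytesIO

import pandas as pd
from pyspark.sql.types import DoubleType, StringType, StructField, StructType

from rtdip_sdk.pipelines.sources.spark.iso.base_iso import BaseISOSource


class ExampleISOSource(BaseISOSource):
    iso_url = "https://example-iso.test/api/"  # assumed endpoint
    required_options = ["report"]
    spark_schema = StructType(
        [
            StructField("Timestamp", StringType(), True),
            StructField("Load", DoubleType(), True),
        ]
    )

    def _pull_data(self) -> pd.DataFrame:
        # Reuse the base class HTTP helper; the URL suffix is illustrative.
        report = self.options["report"]
        return pd.read_csv(BytesIO(self._fetch_from_url(f"reports/{report}.csv")))

    def _prepare_data(self, df: pd.DataFrame) -> pd.DataFrame:
        # Rename the raw API columns so _get_data can reorder them to spark_schema.
        return df.rename(columns={"ts": "Timestamp", "mw": "Load"})
```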

pre_read_validation()

Ensures all the required options are provided and performs other validations. Returns: True if all checks are passed.

Source code in src/sdk/python/rtdip_sdk/pipelines/sources/spark/iso/base_iso.py
def pre_read_validation(self) -> bool:
    """
    Ensures all the required options are provided and performs other validations.
    Returns:
         True if all checks are passed.

    """
    for key in self.required_options:
        if key not in self.options:
            raise ValueError(f"Required option `{key}` is missing.")

    return self._validate_options()

read_batch()

Spark entrypoint; it executes the entire process of pulling, transforming and fixing data. Returns: Final Spark DataFrame converted from a Pandas DataFrame post-execution.

Source code in src/sdk/python/rtdip_sdk/pipelines/sources/spark/iso/base_iso.py
def read_batch(self) -> DataFrame:
    """
    Spark entrypoint, It executes the entire process of pulling, transforming & fixing data.
    Returns:
         Final Spark DataFrame converted from Pandas DataFrame post-execution.

    """

    try:
        self.pre_read_validation()
        pdf = self._get_data()
        pdf = _prepare_pandas_to_convert_to_spark(pdf)

        # The below is to fix the compatibility issues between Pandas 2.0 and PySpark.
        pd.DataFrame.iteritems = pd.DataFrame.items
        df = self.spark.createDataFrame(data=pdf, schema=self.spark_schema)
        return df

    except Exception as e:
        logging.exception(str(e))
        raise e

read_stream()

By default, the streaming operation is not supported but child classes can override if ISO supports streaming.

Returns:

- DataFrame: Final Spark DataFrame after all the processing.

Source code in src/sdk/python/rtdip_sdk/pipelines/sources/spark/iso/base_iso.py
def read_stream(self) -> DataFrame:
    """
    By default, the streaming operation is not supported but child classes can override if ISO supports streaming.

    Returns:
         Final Spark DataFrame after all the processing.

    """

    raise NotImplementedError(
        f"{self.__class__.__name__} connector doesn't support stream operation."
    )
Source code in src/sdk/python/rtdip_sdk/pipelines/sources/spark/iso/caiso_daily_load_iso.py
class CAISODailyLoadISOSource(BaseISOSource):
    """
    The CAISO Daily Load ISO Source is used to read daily load data from CAISO API.
    It supports multiple types of data. Check the `load_types` attribute.

    To read more about the available reports from CAISO API, download the file -
     [Interface Specification](https://www.caiso.com/Documents/OASISAPISpecification.pdf)

    From the list of reports in the file, it pulls the report named `CAISO Demand Forecast` in the file.

    Parameters:
        spark (SparkSession): Spark Session instance
        options (dict): A dictionary of ISO Source specific configurations (See Attributes table below)

    Attributes:
        load_types (list): Must be a subset of [`Demand Forecast 7-Day Ahead`, `Demand Forecast 2-Day Ahead`, `Demand Forecast Day Ahead`, `RTM 15Min Load Forecast`, `RTM 5Min Load Forecast`, `Total Actual Hourly Integrated Load`]. <br> Default Value - `[Total Actual Hourly Integrated Load]`.
        date (str): Must be in `YYYY-MM-DD` format.

    Please check the BaseISOSource for available methods.

    BaseISOSource:
        ::: src.sdk.python.rtdip_sdk.pipelines.sources.spark.iso.base_iso
    """

    spark: SparkSession
    options: dict
    iso_url: str = "https://oasis.caiso.com/oasisapi/SingleZip"
    query_datetime_format: str = "%Y%m%dT00:00-0000"
    required_options = ["load_types", "date"]
    spark_schema = CAISO_SCHEMA
    default_query_timezone = "UTC"

    def __init__(self, spark: SparkSession, options: dict) -> None:
        super().__init__(spark, options)
        self.spark = spark
        self.options = options
        self.load_types = self.options.get(
            "load_types", ["Total Actual Hourly Integrated Load"]
        )
        self.date = self.options.get("date", "").strip()
        self.user_datetime_format = "%Y-%m-%d"

        # The following to fix the Security Check Error as the CAISO API is timing out with HTTPS protocol.
        self.iso_url = self.iso_url.replace("s://", "://")

    def _pull_data(self) -> pd.DataFrame:
        """
        Pulls data from the CAISO API and parses the zip files for CSV data.

        Returns:
            Raw form of data.
        """

        logging.info(f"Getting {self.load_types} data for date {self.date}")
        start_date = datetime.strptime(self.date, self.user_datetime_format)
        end_date = start_date + timedelta(days=1)
        return self._fetch_and_parse_zip(start_date, end_date)

    def _fetch_and_parse_zip(
        self, start_date: datetime, end_date: datetime
    ) -> pd.DataFrame:
        suffix = (
            f"?resultformat=6&"
            f"queryname=SLD_FCST&"
            "version=1&"
            f"startdatetime={start_date.strftime(self.query_datetime_format)}&"
            f"enddatetime={end_date.strftime(self.query_datetime_format)}"
        )

        content = self._fetch_from_url(suffix)
        if not content:
            raise HTTPError("Empty Response was returned")
        logging.info("Unzipping the file")

        zf = ZipFile(BytesIO(content))

        csvs = list(filter(lambda name: ".csv" in name, zf.namelist()))
        if len(csvs) == 0:
            raise ValueError("No data was found in the specified interval")

        df = pd.read_csv(zf.open(csvs[0]))
        return df

    def _prepare_data(self, df: pd.DataFrame) -> pd.DataFrame:
        date_cols = ["INTERVALSTARTTIME_GMT", "INTERVALENDTIME_GMT"]
        for date_col in date_cols:
            df[date_col] = df[date_col].apply(
                lambda data: datetime.strptime(str(data)[:19], "%Y-%m-%dT%H:%M:%S")
            )

        df = df.rename(
            columns={
                "INTERVALSTARTTIME_GMT": "StartTime",
                "INTERVALENDTIME_GMT": "EndTime",
                "LOAD_TYPE": "LoadType",
                "OPR_DT": "OprDt",
                "OPR_HR": "OprHr",
                "OPR_INTERVAL": "OprInterval",
                "MARKET_RUN_ID": "MarketRunId",
                "TAC_AREA_NAME": "TacAreaName",
                "LABEL": "Label",
                "XML_DATA_ITEM": "XmlDataItem",
                "POS": "Pos",
                "MW": "Load",
                "EXECUTION_TYPE": "ExecutionType",
                "GROUP": "Group",
            }
        )

        return df

    def _sanitize_data(self, df: pd.DataFrame) -> pd.DataFrame:
        df = df[df["Label"].isin(self.load_types)]
        return df

    def _validate_options(self) -> bool:
        try:
            datetime.strptime(self.date, self.user_datetime_format)
        except ValueError:
            raise ValueError(
                f"Unable to parse date. Please specify in {self.user_datetime_format} format."
            )
        return True

CAISOHistoricalLoadISOSource

Bases: CAISODailyLoadISOSource

The CAISO Historical Load ISO Source is used to read load data for an interval of dates, between start_date and end_date inclusive, from the CAISO API. It supports multiple types of data; check the load_types attribute.

To read more about the available reports from the CAISO API, download the Interface Specification file.

From the list of reports in that file, this source pulls the report named CAISO Demand Forecast.

Parameters:

- spark (SparkSession): Spark Session instance. Required.
- options (dict): A dictionary of ISO Source specific configurations (see the Attributes table below). Required.

Attributes:

- load_types (list): Must be a subset of [Demand Forecast 7-Day Ahead, Demand Forecast 2-Day Ahead, Demand Forecast Day Ahead, RTM 15Min Load Forecast, RTM 5Min Load Forecast, Total Actual Hourly Integrated Load]. Default Value - [Total Actual Hourly Integrated Load].
- start_date (str): Must be in YYYY-MM-DD format.
- end_date (str): Must be in YYYY-MM-DD format.

Please check the BaseISOSource for available methods.
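
No rendered usage example is included for this class either, so here is a minimal sketch, again assuming the class is exported from rtdip_sdk.pipelines.sources; the dates are illustrative and the interval is inclusive of both endpoints.

```python
from rtdip_sdk.pipelines.sources import CAISOHistoricalLoadISOSource
from rtdip_sdk.pipelines.utilities import SparkSessionUtility

# Not required if using Databricks
spark = SparkSessionUtility(config={}).execute()

caiso_source = CAISOHistoricalLoadISOSource(
    spark,
    options={
        "load_types": ["Total Actual Hourly Integrated Load"],
        "start_date": "2023-01-01",  # illustrative
        "end_date": "2023-01-31",    # illustrative
    },
)

df = caiso_source.read_batch()
```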

Source code in src/sdk/python/rtdip_sdk/pipelines/sources/spark/iso/caiso_historical_load_iso.py
class CAISOHistoricalLoadISOSource(CAISODailyLoadISOSource):
    """
    The CAISO Historical Load ISO Source is used to read load data for an interval of dates
     between start_date and end_date inclusive from CAISO API.
    It supports multiple types of data. Check the `load_types` attribute.

    To read more about the available reports from CAISO API, download the file -
     [Interface Specification](https://www.caiso.com/Documents/OASISAPISpecification.pdf)

    From the list of reports in the file, it pulls the report named `CAISO Demand Forecast` in the file.

    Parameters:
        spark (SparkSession): Spark Session instance
        options (dict): A dictionary of ISO Source specific configurations (See Attributes table below)

    Attributes:
        load_types (list): Must be a subset of [`Demand Forecast 7-Day Ahead`, `Demand Forecast 2-Day Ahead`, `Demand Forecast Day Ahead`, `RTM 15Min Load Forecast`, `RTM 5Min Load Forecast`, `Total Actual Hourly Integrated Load`]. <br> Default Value - `[Total Actual Hourly Integrated Load]`.
        start_date (str): Must be in `YYYY-MM-DD` format.
        end_date (str): Must be in `YYYY-MM-DD` format.

    Please check the BaseISOSource for available methods.

    BaseISOSource:
        ::: src.sdk.python.rtdip_sdk.pipelines.sources.spark.iso.base_iso
    """

    spark: SparkSession
    options: dict
    required_options = ["load_types", "start_date", "end_date"]

    def __init__(self, spark: SparkSession, options: dict) -> None:
        super().__init__(spark, options)
        self.spark = spark
        self.options = options
        self.load_types = self.options.get(
            "load_types", ["Total Actual Hourly Integrated Load"]
        )
        self.start_date = self.options.get("start_date", "").strip()
        self.end_date = self.options.get("end_date", "").strip()
        self.user_datetime_format = "%Y-%m-%d"

    def _pull_data(self) -> pd.DataFrame:
        """
        Pulls data from the CAISO API and parses the zip files for CSV data.

        Returns:
            Raw form of data.
        """

        logging.info(
            f"Getting {self.load_types} data from {self.start_date} to {self.end_date}"
        )
        start_date = datetime.strptime(self.start_date, self.user_datetime_format)
        end_date = datetime.strptime(self.end_date, self.user_datetime_format)
        end_date = end_date + timedelta(days=1)
        generated_days_ranges = []
        dates = pd.date_range(start_date, end_date, freq="30D", inclusive="left")

        for date in dates:
            py_date = date.to_pydatetime()
            date_last = py_date + timedelta(days=30)
            date_last = min(date_last, end_date)
            generated_days_ranges.append((py_date, date_last))

        logging.info(f"Generated date ranges are {generated_days_ranges}")

        dfs = []
        for idx, date_range in enumerate(generated_days_ranges):
            start_date_str, end_date_str = date_range
            df = self._fetch_and_parse_zip(start_date_str, end_date_str)

            dfs.append(df)

        return pd.concat(dfs)

    def _validate_options(self) -> bool:
        try:
            datetime.strptime(self.start_date, self.user_datetime_format)
        except ValueError:
            raise ValueError(
                f"Unable to parse start_date. Please specify in {self.user_datetime_format} format."
            )

        try:
            datetime.strptime(self.end_date, self.user_datetime_format)
        except ValueError:
            raise ValueError(
                f"Unable to parse end_date. Please specify in {self.user_datetime_format} format."
            )

        return True

SparkWeatherCompanyBaseWeatherSource

Bases: BaseISOSource

Base class for all the Weather related sources. Provides common functionality.

Parameters:

- spark (SparkSession): Spark Session instance. Required.
- options (dict): A dictionary of Weather Source specific configurations. Required.
Source code in src/sdk/python/rtdip_sdk/pipelines/sources/spark/the_weather_company/base_weather.py
class SparkWeatherCompanyBaseWeatherSource(BaseISOSource):
    """
    Base class for all the Weather related sources. Provides common functionality.

    Parameters:
        spark (SparkSession): Spark Session instance
        options (dict): A dictionary of Weather Source specific configurations.

    """

    spark: SparkSession
    options: dict
    weather_url: str = "https://"
    api_params: dict = {}

    def __init__(self, spark: SparkSession, options: dict) -> None:
        super(SparkWeatherCompanyBaseWeatherSource, self).__init__(spark, options)
        self.spark = spark
        self.options = options

    def _get_api_params(self) -> dict:
        return self.api_params

    def _fetch_weather_from_url(self, url_suffix: str, params: dict) -> bytes:
        """
        Gets data from external Weather Forecast API.

        Args:
            url_suffix: String to be used as suffix to weather url.

        Returns:
            Raw content of the data received.

        """
        url = f"{self.weather_url}{url_suffix}"
        logging.info(f"Requesting URL - `{url}` with params - {params}")

        response = requests.get(url, params)
        code = response.status_code

        if code != 200:
            raise HTTPError(
                f"Unable to access URL `{url}`."
                f" Received status code {code} with message {response.content}"
            )
        return response.content

    def _fetch_from_url(self, url_suffix: str) -> bytes:
        return self._fetch_weather_from_url(url_suffix, self._get_api_params())
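
For illustration, a subclass of this base class typically sets weather_url, supplies its query parameters via _get_api_params, and parses the response in _pull_data; _fetch_from_url then forwards those parameters to _fetch_weather_from_url. The sketch below is hypothetical (it is not part of the SDK, and its import path, options and JSON keys are assumptions); the SparkWeatherCompanyForecastAPIV1Source documented next follows exactly this pattern.

```python
# Hypothetical subclass for illustration only.
import json

import pandas as pd

from rtdip_sdk.pipelines.sources.spark.the_weather_company.base_weather import (
    SparkWeatherCompanyBaseWeatherSource,  # assumed import path from the source path above
)


class ExampleWeatherSource(SparkWeatherCompanyBaseWeatherSource):
    weather_url = "https://api.weather.com/v1/geocode/"
    required_options = ["lat", "lon", "api_key"]
    # In practice a spark_schema matching the parsed columns would also be defined.

    def _get_api_params(self) -> dict:
        # Sent with every request made through _fetch_from_url.
        return {"apiKey": self.options["api_key"], "units": "e", "language": "en-US"}

    def _pull_data(self) -> pd.DataFrame:
        lat, lon = self.options["lat"], self.options["lon"]
        raw = self._fetch_from_url(f"{lat}/{lon}/forecast/hourly/360hour.json")
        return pd.DataFrame(json.loads(raw.decode("utf-8"))["forecasts"])
```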

SparkWeatherCompanyForecastAPIV1Source

Bases: SparkWeatherCompanyBaseWeatherSource

The Weather Forecast API V1 Source is used to read a 15-day forecast from the Weather API.

URL: https://api.weather.com/v1/geocode/32.3667/-95.4/forecast/hourly/360hour.json

Parameters:

- spark (SparkSession): Spark Session instance. Required.
- options (dict): A dictionary of ISO Source specific configurations (see the Attributes table below). Required.

Attributes:

- lat (str): Latitude of the Weather Station.
- lon (str): Longitude of the Weather Station.
- api_key (str): Weather API key.
- language (str): API response language. Defaults to en-US.
- units (str): Unit of measurements. Defaults to e.
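
No rendered usage example is included for this class, so here is a minimal sketch, assuming the class is exported from rtdip_sdk.pipelines.sources; the coordinates and the API key placeholder are illustrative.

```python
from rtdip_sdk.pipelines.sources import SparkWeatherCompanyForecastAPIV1Source
from rtdip_sdk.pipelines.utilities import SparkSessionUtility

# Not required if using Databricks
spark = SparkSessionUtility(config={}).execute()

weather_source = SparkWeatherCompanyForecastAPIV1Source(
    spark,
    options={
        "lat": "32.3667",
        "lon": "-95.4",
        "api_key": "{API-KEY}",
    },
)

df = weather_source.read_batch()
```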

Source code in src/sdk/python/rtdip_sdk/pipelines/sources/spark/the_weather_company/weather_forecast_api_v1.py
class SparkWeatherCompanyForecastAPIV1Source(SparkWeatherCompanyBaseWeatherSource):
    """
    The Weather Forecast API V1 Source is used to read 15 days forecast from the Weather API.

    URL: <a href="https://api.weather.com/v1/geocode/32.3667/-95.4/forecast/hourly/360hour.json">
    https://api.weather.com/v1/geocode/32.3667/-95.4/forecast/hourly/360hour.json</a>

    Parameters:
        spark (SparkSession): Spark Session instance
        options (dict): A dictionary of ISO Source specific configurations (See Attributes table below).

    Attributes:
        lat (str): Latitude of the Weather Station.
        lon (str): Longitude of the Weather Station.
        api_key (str): Weather API key.
        language (str): API response language. Defaults to `en-US`.
        units (str): Unit of measurements. Defaults to `e`.
    """

    spark: SparkSession
    spark_schema = WEATHER_FORECAST_SCHEMA
    options: dict
    weather_url: str = "https://api.weather.com/v1/geocode/"
    required_options = ["lat", "lon", "api_key"]

    def __init__(self, spark: SparkSession, options: dict) -> None:
        super(SparkWeatherCompanyForecastAPIV1Source, self).__init__(spark, options)
        self.spark = spark
        self.options = options
        self.lat = self.options.get("lat", "").strip()
        self.lon = self.options.get("lon", "").strip()
        self.api_key = self.options.get("api_key", "").strip()
        self.language = self.options.get("language", "en-US").strip()
        self.units = self.options.get("units", "e").strip()

    def _prepare_data(self, df: pd.DataFrame) -> pd.DataFrame:
        """
        Prepares weather data for the use.

        Args:
            df: Data received after preparation.

        Returns:
            Final data after all the transformations.

        """

        rename_cols = {
            "latitude": "Latitude",
            "longitude": "Longitude",
            "class": "Class",
            "expire_time_gmt": "ExpireTimeGmt",
            "fcst_valid": "FcstValid",
            "fcst_valid_local": "FcstValidLocal",
            "num": "Num",
            "day_ind": "DayInd",
            "temp": "Temp",
            "dewpt": "Dewpt",
            "hi": "Hi",
            "wc": "Wc",
            "feels_like": "FeelsLike",
            "icon_extd": "IconExtd",
            "wxman": "Wxman",
            "icon_code": "IconCode",
            "dow": "Dow",
            "phrase_12char": "Phrase12Char",
            "phrase_22char": "Phrase22Char",
            "phrase_32char": "Phrase32Char",
            "subphrase_pt1": "SubphrasePt1",
            "subphrase_pt2": "SubphrasePt2",
            "subphrase_pt3": "SubphrasePt3",
            "pop": "Pop",
            "precip_type": "PrecipType",
            "qpf": "Qpf",
            "snow_qpf": "SnowQpf",
            "rh": "Rh",
            "wspd": "Wspd",
            "wdir": "Wdir",
            "wdir_cardinal": "WdirCardinal",
            "gust": "Gust",
            "clds": "Clds",
            "vis": "Vis",
            "mslp": "Mslp",
            "uv_index_raw": "UvIndexRaw",
            "uv_index": "UvIndex",
            "uv_warning": "UvWarning",
            "uv_desc": "UvDesc",
            "golf_index": "GolfIndex",
            "golf_category": "GolfCategory",
            "severity": "Severity",
        }

        df = df.rename(columns=rename_cols)

        fields = self.spark_schema.fields

        str_cols = list(
            map(
                lambda x: x.name,
                filter(lambda x: isinstance(x.dataType, StringType), fields),
            )
        )
        double_cols = list(
            map(
                lambda x: x.name,
                filter(lambda x: isinstance(x.dataType, DoubleType), fields),
            )
        )
        int_cols = list(
            map(
                lambda x: x.name,
                filter(lambda x: isinstance(x.dataType, IntegerType), fields),
            )
        )

        df[str_cols] = df[str_cols].astype(str)
        df[double_cols] = df[double_cols].astype(float)
        df[int_cols] = df[int_cols].astype(int)

        df.reset_index(inplace=True, drop=True)

        return df

    def _get_api_params(self):
        params = {
            "language": self.language,
            "units": self.units,
            "apiKey": self.api_key,
        }
        return params

    def _pull_for_weather_station(self, lat: str, lon: str) -> pd.DataFrame:
        response = json.loads(
            self._fetch_from_url(f"{lat}/{lon}/forecast/hourly/360hour.json").decode(
                "utf-8"
            )
        )
        return pd.DataFrame(response["forecasts"])

    def _pull_data(self) -> pd.DataFrame:
        """
        Pulls data from the Weather API and parses the JSON file.

        Returns:
            Raw form of data.
        """

        df = self._pull_for_weather_station(self.lat, self.lon)
        df["latitude"] = self.lat
        df["longitude"] = self.lon

        return df

SparkWeatherCompanyForecastAPIV1MultiSource

Bases: SparkWeatherCompanyForecastAPIV1Source

The Weather Forecast API V1 Multi Source is used to read a 15-day forecast from the Weather API. It allows pulling weather data for multiple stations and returns all of it in a single DataFrame.

URL for one station: https://api.weather.com/v1/geocode/32.3667/-95.4/forecast/hourly/360hour.json

It takes a list of Weather Stations. Each station item must contain a comma-separated Latitude & Longitude.

Examples

["32.3667,-95.4", "51.52,-0.11"]

Parameters:

- spark (SparkSession): Spark Session instance. Required.
- options (dict): A dictionary of ISO Source specific configurations (see the Attributes table below). Required.

Attributes:

- stations (list[str]): List of Weather Stations.
- api_key (str): Weather API key.
- language (str): API response language. Defaults to en-US.
- units (str): Unit of measurements. Defaults to e.
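
A minimal usage sketch, assuming the class is exported from rtdip_sdk.pipelines.sources; the station coordinates and the API key placeholder are illustrative.

```python
from rtdip_sdk.pipelines.sources import SparkWeatherCompanyForecastAPIV1MultiSource
from rtdip_sdk.pipelines.utilities import SparkSessionUtility

# Not required if using Databricks
spark = SparkSessionUtility(config={}).execute()

multi_source = SparkWeatherCompanyForecastAPIV1MultiSource(
    spark,
    options={
        # Each station is "latitude,longitude"
        "stations": ["32.3667,-95.4", "51.52,-0.11"],
        "api_key": "{API-KEY}",
    },
)

df = multi_source.read_batch()
```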

Source code in src/sdk/python/rtdip_sdk/pipelines/sources/spark/the_weather_company/weather_forecast_api_v1_multi.py
class SparkWeatherCompanyForecastAPIV1MultiSource(
    SparkWeatherCompanyForecastAPIV1Source
):
    """
    The Weather Forecast API V1 Multi Source is used to read 15 days forecast from the Weather API. It allows to
    pull weather data for multiple stations and returns all of them in a single DataFrame.

    URL for one station: <a href="https://api.weather.com/v1/geocode/32.3667/-95.4/forecast/hourly/360hour.json">
    https://api.weather.com/v1/geocode/32.3667/-95.4/forecast/hourly/360hour.json</a>

    It takes a list of Weather Stations. Each station item must contain comma separated Latitude & Longitude.

    Examples
    --------
    `["32.3667,-95.4", "51.52,-0.11"]`

    Parameters:
        spark (SparkSession): Spark Session instance
        options (dict): A dictionary of ISO Source specific configurations (See Attributes table below).

    Attributes:
        stations (list[str]): List of Weather Stations.
        api_key (str): Weather API key.
        language (str): API response language. Defaults to `en-US`.
        units (str): Unit of measurements. Defaults to `e`.
    """

    spark: SparkSession
    options: dict
    spark_schema = WEATHER_FORECAST_SCHEMA
    required_options = ["stations", "api_key"]

    def __init__(self, spark: SparkSession, options: dict) -> None:
        super(SparkWeatherCompanyForecastAPIV1MultiSource, self).__init__(
            spark, options
        )
        self.spark = spark
        self.options = options
        self.stations = self.options.get("stations", [])
        self.api_key = self.options.get("api_key", "").strip()
        self.language = self.options.get("language", "en-US").strip()
        self.units = self.options.get("units", "e").strip()

    def _pull_data(self) -> pd.DataFrame:
        """
        Pulls data from the Weather API and parses the JSON file for multiple stations

        Returns:
            Raw form of data.
        """

        result_df = None
        for station in self.stations:
            parts = station.split(",")
            lat, lon = parts

            df = self._pull_for_weather_station(lat, lon)
            df["latitude"] = lat
            df["longitude"] = lon

            if result_df is not None:
                result_df = pd.concat([result_df, df])
            else:
                result_df = df

        return result_df

    def _validate_options(self) -> bool:
        for station in self.stations:
            parts = station.split(",")

            if len(parts) != 2 or parts[0].strip() == "" or parts[1].strip() == "":
                raise ValueError(
                    f"Each station item must contain comma separated Latitude & Longitude. Eg: 10.23:45.2"
                )

        return True

PythonDeltaSource

Bases: SourceInterface

The Python Delta Source is used to read data from a Delta table without using Apache Spark, returning a Polars LazyFrame.

Example

from rtdip_sdk.pipelines.sources import PythonDeltaSource

path = "abfss://{FILE-SYSTEM}@{ACCOUNT-NAME}.dfs.core.windows.net/{PATH}/{FILE-NAME}

python_delta_source = PythonDeltaSource(
    path=path,
    version=None,
    storage_options={
        "azure_storage_account_name": "{AZURE-STORAGE-ACCOUNT-NAME}",
        "azure_storage_account_key": "{AZURE-STORAGE-ACCOUNT-KEY}"
    },
    pyarrow_options=None,
    without_files=False
)

python_delta_source.read_batch()
from rtdip_sdk.pipelines.sources import PythonDeltaSource

path = "https://s3.{REGION-CODE}.amazonaws.com/{BUCKET-NAME}/{KEY-NAME}"

python_delta_source = PythonDeltaSource(
    path=path,
    version=None,
    storage_options={
        "aws_access_key_id": "{AWS-ACCESS-KEY-ID}",
        "aws_secret_access_key": "{AWS-SECRET-ACCESS-KEY}"
    },
    pyarrow_options=None,
    without_files=False
)

python_delta_source.read_batch()

Parameters:

- path (str): Path to the Delta table. Can be local or in S3/Azure storage. Required.
- version (optional int): Specify the Delta table version to read from. Defaults to the latest version. Default: None.
- storage_options (optional dict): Used to read from AWS/Azure storage. For AWS use format {"aws_access_key_id": "<>", "aws_secret_access_key": "<>"}. For Azure use format {"azure_storage_account_name": "<>", "azure_storage_account_key": "<>"}. Default: None.
- pyarrow_options (optional dict): Data Access and Efficiency options when reading from Delta. See to_pyarrow_dataset. Default: None.
- without_files (optional bool): If True loads the table without tracking files. Default: False.
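
Because read_batch returns a Polars LazyFrame rather than an eager DataFrame, downstream code usually chains lazy operations and calls collect() to execute them. A brief sketch, continuing the examples above (the column name is illustrative):

```python
import polars as pl

# `python_delta_source` is an instance created as in the examples above.
lf = python_delta_source.read_batch()

# Nothing is read until collect() is called; the column name "Value" is illustrative.
df = lf.filter(pl.col("Value") > 0.0).collect()
```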
Source code in src/sdk/python/rtdip_sdk/pipelines/sources/python/delta.py
class PythonDeltaSource(SourceInterface):
    """
    The Python Delta Source is used to read data from a Delta table without using Apache Spark, returning a Polars LazyFrame.

     Example
    --------
    === "Azure"

        ```python
        from rtdip_sdk.pipelines.sources import PythonDeltaSource

        path = "abfss://{FILE-SYSTEM}@{ACCOUNT-NAME}.dfs.core.windows.net/{PATH}/{FILE-NAME}

        python_delta_source = PythonDeltaSource(
            path=path,
            version=None,
            storage_options={
                "azure_storage_account_name": "{AZURE-STORAGE-ACCOUNT-NAME}",
                "azure_storage_account_key": "{AZURE-STORAGE-ACCOUNT-KEY}"
            },
            pyarrow_options=None,
            without_files=False
        )

        python_delta_source.read_batch()
        ```
    === "AWS"

        ```python
        from rtdip_sdk.pipelines.sources import PythonDeltaSource

        path = "https://s3.{REGION-CODE}.amazonaws.com/{BUCKET-NAME}/{KEY-NAME}"

        python_delta_source = PythonDeltaSource(
            path=path,
            version=None,
            storage_options={
                "aws_access_key_id": "{AWS-ACCESS-KEY-ID}",
                "aws_secret_access_key": "{AWS-SECRET-ACCESS-KEY}"
            },
            pyarrow_options=None,
            without_files=False
        )

        python_delta_source.read_batch()
        ```

    Parameters:
        path (str): Path to the Delta table. Can be local or in S3/Azure storage
        version (optional int): Specify the Delta table version to read from. Defaults to the latest version
        storage_options (optional dict): Used to read from AWS/Azure storage. For AWS use format {"aws_access_key_id": "<>", "aws_secret_access_key":"<>"}. For Azure use format {"azure_storage_account_name": "<>", "azure_storage_account_key": "<>"}.
        pyarrow_options (optional dict): Data Access and Efficiency options when reading from Delta. See [to_pyarrow_dataset](https://delta-io.github.io/delta-rs/python/api_reference.html#deltalake.table.DeltaTable.to_pyarrow_dataset){ target="_blank" }.
        without_files (optional bool): If True loads the table without tracking files
    """

    path: str
    version: int
    storage_options: dict
    pyarrow_options: dict
    without_files: bool

    def __init__(
        self,
        path: str,
        version: int = None,
        storage_options: dict = None,
        pyarrow_options: dict = None,
        without_files: bool = False,
    ):
        self.path = path
        self.version = version
        self.storage_options = storage_options
        self.pyarrow_options = pyarrow_options
        self.without_files = without_files

    @staticmethod
    def system_type():
        """
        Attributes:
            SystemType (Environment): Requires PYTHON
        """
        return SystemType.PYTHON

    @staticmethod
    def libraries():
        libraries = Libraries()
        return libraries

    @staticmethod
    def settings() -> dict:
        return {}

    def pre_read_validation(self):
        return True

    def post_read_validation(self):
        return True

    def read_batch(self) -> LazyFrame:
        """
        Reads data from a Delta table into a Polars LazyFrame
        """
        without_files_dict = {"without_files": self.without_files}
        lf = pl.scan_delta(
            source=self.path,
            version=self.version,
            storage_options=self.storage_options,
            delta_table_options=without_files_dict,
            pyarrow_options=self.pyarrow_options,
        )
        return lf

    def read_stream(self):
        """
        Raises:
            NotImplementedError: Reading from a Delta table using Python is only possible for batch reads. To perform a streaming read, use the read_stream method of the SparkDeltaSource component.
        """
        raise NotImplementedError(
            "Reading from a Delta table using Python is only possible for batch reads. To perform a streaming read, use the read_stream method of the SparkDeltaSource component"
        )

system_type() staticmethod

Attributes:

- SystemType (Environment): Requires PYTHON

Source code in src/sdk/python/rtdip_sdk/pipelines/sources/python/delta.py
@staticmethod
def system_type():
    """
    Attributes:
        SystemType (Environment): Requires PYTHON
    """
    return SystemType.PYTHON

read_batch()

Reads data from a Delta table into a Polars LazyFrame

Source code in src/sdk/python/rtdip_sdk/pipelines/sources/python/delta.py
def read_batch(self) -> LazyFrame:
    """
    Reads data from a Delta table into a Polars LazyFrame
    """
    without_files_dict = {"without_files": self.without_files}
    lf = pl.scan_delta(
        source=self.path,
        version=self.version,
        storage_options=self.storage_options,
        delta_table_options=without_files_dict,
        pyarrow_options=self.pyarrow_options,
    )
    return lf

read_stream()

Raises:

- NotImplementedError: Reading from a Delta table using Python is only possible for batch reads. To perform a streaming read, use the read_stream method of the SparkDeltaSource component.

Source code in src/sdk/python/rtdip_sdk/pipelines/sources/python/delta.py
def read_stream(self):
    """
    Raises:
        NotImplementedError: Reading from a Delta table using Python is only possible for batch reads. To perform a streaming read, use the read_stream method of the SparkDeltaSource component.
    """
    raise NotImplementedError(
        "Reading from a Delta table using Python is only possible for batch reads. To perform a streaming read, use the read_stream method of the SparkDeltaSource component"
    )

PythonDeltaSharingSource

Bases: SourceInterface

The Python Delta Sharing Source is used to read data from a Delta table with Delta Sharing configured, without using Apache Spark.

Example

from rtdip_sdk.pipelines.sources import PythonDeltaSharingSource

python_delta_sharing_source = PythonDeltaSharingSource(
    profile_path="{CREDENTIAL-FILE-LOCATION}",
    share_name="{SHARE-NAME}",
    schema_name="{SCHEMA-NAME}",
    table_name="{TABLE-NAME}"
)

python_delta_sharing_source.read_batch()

Parameters:

- profile_path (str): Location of the credential file. Can be any URL supported by FSSPEC. Required.
- share_name (str): The value of 'share=' for the table. Required.
- schema_name (str): The value of 'schema=' for the table. Required.
- table_name (str): The value of 'name=' for the table. Required.
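
The share, schema and table names map onto the profile_path#share_name.schema_name.table_name URL that read_batch builds. If you are unsure of the values a share exposes, the delta-sharing Python connector can list them; the sketch below uses that library directly rather than this SDK, and the profile path is illustrative.

```python
import delta_sharing

profile = "config.share"  # illustrative credential file location

client = delta_sharing.SharingClient(profile)
for table in client.list_all_tables():
    # Each entry's share, schema and name map onto share_name, schema_name
    # and table_name above.
    print(table.share, table.schema, table.name)
```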
Source code in src/sdk/python/rtdip_sdk/pipelines/sources/python/delta_sharing.py
class PythonDeltaSharingSource(SourceInterface):
    """
    The Python Delta Sharing Source is used to read data from a Delta table with Delta Sharing configured, without using Apache Spark.

    Example
    -------
    ```python
    from rtdip_sdk.pipelines.sources import PythonDeltaSharingSource

    python_delta_sharing_source = PythonDeltaSharingSource(
        profile_path="{CREDENTIAL-FILE-LOCATION}",
        share_name="{SHARE-NAME}",
        schema_name="{SCHEMA-NAME}",
        table_name="{TABLE-NAME}"
    )

    python_delta_sharing_source.read_batch()
    ```

    Parameters:
        profile_path (str): Location of the credential file. Can be any URL supported by [FSSPEC](https://filesystem-spec.readthedocs.io/en/latest/index.html){ target="_blank" }
        share_name (str): The value of 'share=' for the table
        schema_name (str): The value of 'schema=' for the table
        table_name (str): The value of 'name=' for the table
    """

    profile_path: str
    share_name: str
    schema_name: str
    table_name: str

    def __init__(
        self, profile_path: str, share_name: str, schema_name: str, table_name: str
    ):
        self.profile_path = profile_path
        self.share_name = share_name
        self.schema_name = schema_name
        self.table_name = table_name

    @staticmethod
    def system_type():
        """
        Attributes:
            SystemType (Environment): Requires PYTHON
        """
        return SystemType.PYTHON

    @staticmethod
    def libraries():
        libraries = Libraries()
        return libraries

    @staticmethod
    def settings() -> dict:
        return {}

    def pre_read_validation(self):
        return True

    def post_read_validation(self):
        return True

    def read_batch(self) -> LazyFrame:
        """
        Reads data from a Delta table with Delta Sharing into a Polars LazyFrame.
        """
        pandas_df = delta_sharing.load_as_pandas(
            f"{self.profile_path}#{self.share_name}.{self.schema_name}.{self.table_name}"
        )
        polars_lazyframe = pl.from_pandas(pandas_df).lazy()
        return polars_lazyframe

    def read_stream(self):
        """
        Raises:
            NotImplementedError: Reading from a Delta table with Delta Sharing using Python is only possible for batch reads.
        """
        raise NotImplementedError(
            "Reading from a Delta table with Delta Sharing using Python is only possible for batch reads."
        )

system_type() staticmethod

Attributes:

- SystemType (Environment): Requires PYTHON

Source code in src/sdk/python/rtdip_sdk/pipelines/sources/python/delta_sharing.py
@staticmethod
def system_type():
    """
    Attributes:
        SystemType (Environment): Requires PYTHON
    """
    return SystemType.PYTHON

read_batch()

Reads data from a Delta table with Delta Sharing into a Polars LazyFrame.

Source code in src/sdk/python/rtdip_sdk/pipelines/sources/python/delta_sharing.py
def read_batch(self) -> LazyFrame:
    """
    Reads data from a Delta table with Delta Sharing into a Polars LazyFrame.
    """
    pandas_df = delta_sharing.load_as_pandas(
        f"{self.profile_path}#{self.share_name}.{self.schema_name}.{self.table_name}"
    )
    polars_lazyframe = pl.from_pandas(pandas_df).lazy()
    return polars_lazyframe

read_stream()

Raises:

- NotImplementedError: Reading from a Delta table with Delta Sharing using Python is only possible for batch reads.

Source code in src/sdk/python/rtdip_sdk/pipelines/sources/python/delta_sharing.py
def read_stream(self):
    """
    Raises:
        NotImplementedError: Reading from a Delta table with Delta Sharing using Python is only possible for batch reads.
    """
    raise NotImplementedError(
        "Reading from a Delta table with Delta Sharing using Python is only possible for batch reads."
    )

SparkECMWFBaseMarsSource

Downloads nc files from the ECMWF MARS server using the ECMWF Python API. Data is downloaded in parallel using joblib.

Parameters:

- save_path (str): Path to local directory where the nc files will be stored, in format "yyyy-mm-dd_HH.nc". Required.
- date_start (str): Start date of extraction in "YYYY-MM-DD HH:MM:SS" format. Required.
- date_end (str): End date of extraction in "YYYY-MM-DD HH:MM:SS" format. Required.
- ecmwf_api_key (str): API key for ECMWF MARS server. Required.
- ecmwf_api_email (str): Email for ECMWF MARS server. Required.
- ecmwf_api_url (str): URL for ECMWF MARS server. Default: 'https://api.ecmwf.int/v1'.
- run_frequency (str): Frequency format of runs to download, e.g. "H". Default: 'H'.
- run_interval (str): Interval of runs, e.g. a run_frequency of "H" and run_interval of "12" will extract the data of the 00 and 12 run for each day. Default: '12'.
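
A minimal usage sketch follows, assuming SparkECMWFBaseMarsSource is exported from rtdip_sdk.pipelines.sources. The dates, paths, credentials and the contents of mars_dict are illustrative MARS request keywords rather than values prescribed by this class; note that date, time, format, target and output must not appear in mars_dict, because retrieve sets them per run.

```python
from rtdip_sdk.pipelines.sources import SparkECMWFBaseMarsSource

# All values below are illustrative.
mars_source = SparkECMWFBaseMarsSource(
    date_start="2021-01-01 00:00:00",
    date_end="2021-01-02 00:00:00",
    save_path="/tmp/ecmwf",
    ecmwf_api_key="{API-KEY}",
    ecmwf_api_email="{EMAIL}",
)

# Example MARS keywords; consult the MARS documentation for valid values.
mars_dict = {
    "class": "od",
    "stream": "oper",
    "type": "fc",
    "levtype": "sfc",
    "param": "167.128",      # 2 metre temperature
    "step": "0/to/48/by/6",
    "grid": "0.25/0.25",
}

mars_source.retrieve(mars_dict=mars_dict, n_jobs=2)
```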
Source code in src/sdk/python/rtdip_sdk/pipelines/sources/spark/ecmwf/base_mars.py
class SparkECMWFBaseMarsSource:
    """
    Download nc files from ECMWF MARS server using the ECMWF python API.
    Data is downloaded in parallel using joblib from ECMWF MARS server using the ECMWF python API.

    Parameters:
        save_path (str): Path to local directory where the nc files will be stored, in format "yyyy-mm-dd_HH.nc"
        date_start (str): Start date of extraction in "YYYY-MM-DD HH:MM:SS" format
        date_end (str): End date of extraction in "YYYY-MM-DD HH:MM:SS" format
        ecmwf_api_key (str): API key for ECMWF MARS server
        ecmwf_api_email (str): Email for ECMWF MARS server
        ecmwf_api_url (str): URL for ECMWF MARS server
        run_frequency (str):Frequency format of runs to download, e.g. "H"
        run_interval (str): Interval of runs, e.g. a run_frequency of "H" and run_interval of "12" will extract the data of the 00 and 12 run for each day.
    """

    def __init__(
        self,
        date_start: str,
        date_end: str,
        save_path: str,
        ecmwf_api_key: str,
        ecmwf_api_email: str,
        ecmwf_api_url: str = "https://api.ecmwf.int/v1",
        run_interval: str = "12",
        run_frequency: str = "H",
    ):
        self.retrieve_ran = False
        self.date_start = date_start
        self.date_end = date_end
        self.save_path = save_path
        self.format = format
        self.run_interval = run_interval
        self.run_frequency = run_frequency
        self.ecmwf_api_key = ecmwf_api_key
        self.ecmwf_api_url = ecmwf_api_url
        self.ecmwf_api_email = ecmwf_api_email

        # Pandas date_list (info best retrieved per forecast day)
        self.dates = pd.date_range(
            start=date_start, end=date_end, freq=run_interval + run_frequency
        )

    def retrieve(
        self,
        mars_dict: dict,
        n_jobs=None,
        backend="loky",
        tries=5,
        cost=False,
    ):
        """Retrieve the data from the server.

        Function will use the ecmwf api to download the data from the server.
        Note that mars has a max of two active requests per user and 20 queued
        requests.
        Data is downloaded in parallel using joblib from ECMWF MARS server using the ECMWF python API.


        Parameters:
            mars_dict (dict): Dictionary of mars parameters.
            n_jobs (int, optional): Download in parallel? by default None, i.e. no parallelization
            backend (str, optional) : Specify the parallelization backend implementation in joblib, by default "loky"
            tries (int, optional): Number of tries for each request if it fails, by default 5
            cost (bool, optional):  Pass a cost request to mars to estimate the size and efficiency of your request,
                but not actually download the data. Can be useful for defining requests,
                by default False.
        """
        chk = ["date", "target", "time", "format", "output"]
        for i in chk:
            if i in mars_dict.keys():
                raise ValueError(f"don't include {i} in the mars_dict")

        parallel = Parallel(n_jobs=n_jobs, backend=backend)

        def _retrieve_datetime(i, j, cost=cost):
            i_dict = {"date": i, "time": j}

            if cost:
                filename = f"{i}_{j}.txt"  # NOSONAR
            else:
                filename = f"{i}_{j}.nc"
                i_dict["format"] = "netcdf"  # NOSONAR

            target = os.path.join(self.save_path, filename)
            msg = f"retrieving mars data --- {filename}"

            req_dict = {**i_dict, **mars_dict}
            for k, v in req_dict.items():
                if isinstance(v, (list, tuple)):
                    req_dict[k] = "/".join([str(x) for x in v])  # NOSONAR

            req_dict = ["{}={}".format(k, v) for k, v in req_dict.items()]
            if cost:
                req_dict = "list,output=cost,{}".format(",".join(req_dict))  # NOSONAR
            else:
                req_dict = "retrieve,{}".format(",".join(req_dict))  # NOSONAR

            for j in range(tries):
                try:
                    print(msg)
                    server = ECMWFService(
                        "mars",
                        url=self.ecmwf_api_url,
                        email=self.ecmwf_api_email,
                        key=self.ecmwf_api_key,
                    )
                    server.execute(req_dict, target)
                    return 1  # NOSONAR
                except:  # NOSONAR
                    if j < tries - 1:
                        continue  # NOSONAR
                    else:
                        return 0  # NOSONAR

        self.success = parallel(
            delayed(_retrieve_datetime)(str(k.date()), f"{k.hour:02}")
            for k in self.dates
        )
        self.retrieve_ran = True

        return self

    def info(self) -> pd.Series:
        """
        Return info on each ECMWF request.

        Returns:
            pd.Series: Successful request for each run == 1.
        """
        if not self.retrieve_ran:
            raise ValueError(
                "Before using self.info(), prepare the request using "
                + "self.retrieve()"
            )
        y = pd.Series(self.success, index=self.dates, name="success", dtype=bool)

        return y

retrieve(mars_dict, n_jobs=None, backend='loky', tries=5, cost=False)

Retrieve the data from the server.

The function uses the ECMWF API to download the data from the server. Note that MARS allows a maximum of two active requests per user and 20 queued requests. Data is downloaded in parallel using joblib.

Parameters:

Name Type Description Default
mars_dict dict

Dictionary of mars parameters.

required
n_jobs int

Download in parallel? by default None, i.e. no parallelization

None
backend str, optional)

Specify the parallelization backend implementation in joblib, by default "loky"

'loky'
tries int

Number of tries for each request if it fails, by default 5

5
cost bool

Pass a cost request to mars to estimate the size and efficiency of your request, but not actually download the data. Can be useful for defining requests, by default False.

False
Source code in src/sdk/python/rtdip_sdk/pipelines/sources/spark/ecmwf/base_mars.py
def retrieve(
    self,
    mars_dict: dict,
    n_jobs=None,
    backend="loky",
    tries=5,
    cost=False,
):
    """Retrieve the data from the server.

    Function will use the ecmwf api to download the data from the server.
    Note that mars has a max of two active requests per user and 20 queued
    requests.
    Data is downloaded in parallel using joblib from ECMWF MARS server using the ECMWF python API.


    Parameters:
        mars_dict (dict): Dictionary of mars parameters.
        n_jobs (int, optional): Download in parallel? by default None, i.e. no parallelization
        backend (str, optional) : Specify the parallelization backend implementation in joblib, by default "loky"
        tries (int, optional): Number of tries for each request if it fails, by default 5
        cost (bool, optional):  Pass a cost request to mars to estimate the size and efficiency of your request,
            but not actually download the data. Can be useful for defining requests,
            by default False.
    """
    chk = ["date", "target", "time", "format", "output"]
    for i in chk:
        if i in mars_dict.keys():
            raise ValueError(f"don't include {i} in the mars_dict")

    parallel = Parallel(n_jobs=n_jobs, backend=backend)

    def _retrieve_datetime(i, j, cost=cost):
        i_dict = {"date": i, "time": j}

        if cost:
            filename = f"{i}_{j}.txt"  # NOSONAR
        else:
            filename = f"{i}_{j}.nc"
            i_dict["format"] = "netcdf"  # NOSONAR

        target = os.path.join(self.save_path, filename)
        msg = f"retrieving mars data --- {filename}"

        req_dict = {**i_dict, **mars_dict}
        for k, v in req_dict.items():
            if isinstance(v, (list, tuple)):
                req_dict[k] = "/".join([str(x) for x in v])  # NOSONAR

        req_dict = ["{}={}".format(k, v) for k, v in req_dict.items()]
        if cost:
            req_dict = "list,output=cost,{}".format(",".join(req_dict))  # NOSONAR
        else:
            req_dict = "retrieve,{}".format(",".join(req_dict))  # NOSONAR

        for j in range(tries):
            try:
                print(msg)
                server = ECMWFService(
                    "mars",
                    url=self.ecmwf_api_url,
                    email=self.ecmwf_api_email,
                    key=self.ecmwf_api_key,
                )
                server.execute(req_dict, target)
                return 1  # NOSONAR
            except:  # NOSONAR
                if j < tries - 1:
                    continue  # NOSONAR
                else:
                    return 0  # NOSONAR

    self.success = parallel(
        delayed(_retrieve_datetime)(str(k.date()), f"{k.hour:02}")
        for k in self.dates
    )
    self.retrieve_ran = True

    return self
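
Because `retrieve()` adds the `date`, `time`, `target`, `format` and `output` fields itself, the `mars_dict` must not contain those keys. Below is a sketch of a cost-only request, which asks MARS to estimate the request instead of downloading data; the keywords and values shown are illustrative assumptions, not defaults from the source.

```python
# Illustrative MARS keywords -- adjust to your dataset; do not include
# "date", "target", "time", "format" or "output" (retrieve() sets these).
mars_dict = {
    "class": "od",
    "stream": "oper",
    "type": "fc",
    "param": ["167.128"],   # assumed variable code
    "step": [0, 6, 12],     # lists are joined with "/" by retrieve()
    "grid": [0.1, 0.1],
}

# cost=True writes a .txt estimate per run instead of downloading .nc files
source.retrieve(mars_dict=mars_dict, n_jobs=2, tries=5, cost=True)
```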

info()

Return info on each ECMWF request.

Returns:

Type Description
Series

pd.Series: Successful request for each run == 1.

Source code in src/sdk/python/rtdip_sdk/pipelines/sources/spark/ecmwf/base_mars.py
def info(self) -> pd.Series:
    """
    Return info on each ECMWF request.

    Returns:
        pd.Series: Successful request for each run == 1.
    """
    if not self.retrieve_ran:
        raise ValueError(
            "Before using self.info(), prepare the request using "
            + "self.retrieve()"
        )
    y = pd.Series(self.success, index=self.dates, name="success", dtype=bool)

    return y
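
Once `retrieve()` has run, `info()` exposes the per-run outcome as a boolean pandas Series indexed by run datetime. A short sketch, continuing the assumed `source` object from the examples above:

```python
status = source.info()            # pd.Series of booleans, indexed by run datetime
failed_runs = status[~status]     # runs whose MARS request did not return successfully
print(failed_runs.index.tolist())
```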

SparkECMWFWeatherForecastSource

Bases: SourceInterface

The Weather Forecast API V1 Source class to download nc files from the ECMWF MARS server using the ECMWF Python API.

Parameters:

Name Type Description Default
spark SparkSession

Spark Session instance

required
save_path str

Path to local directory where the nc files will be stored, in format "yyyy-mm-dd_HH.nc"

required
date_start str

Start date of extraction in "YYYY-MM-DD HH:MM:SS" format

required
date_end str

End date of extraction in "YYYY-MM-DD HH:MM:SS" format

required
ecmwf_class str

ecmwf classification of data

required
stream str

Operational model stream

required
expver str

Version of data

required
leveltype str

Surface level forecasts

required
ec_vars list

Variables of forecast measurements.

required
forecast_area list

N/W/S/E coordinates of the forecast area

required
ecmwf_api_key str

API key for ECMWF API

required
ecmwf_api_email str

Email for ECMWF API

required
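
A minimal usage sketch is shown below; every value is a placeholder assumption, and the import assumes the class is exposed from `rtdip_sdk.pipelines.sources` like the other sources on this page.

```python
from rtdip_sdk.pipelines.sources import SparkECMWFWeatherForecastSource  # assumed export path
from rtdip_sdk.pipelines.utilities import SparkSessionUtility

# Not required if using Databricks
spark = SparkSessionUtility(config={}).execute()

# All values below are placeholders for illustration
weather_source = SparkECMWFWeatherForecastSource(
    spark=spark,
    save_path="/tmp/ecmwf",
    date_start="2024-01-01 00:00:00",
    date_end="2024-01-02 00:00:00",
    ecmwf_class="od",
    stream="oper",
    expver="1",
    leveltype="sfc",
    ec_vars=["167.128", "168.128"],  # assumed variable codes
    forecast_area=[54, -1, 52, 1],   # N/W/S/E coordinates, illustrative
    ecmwf_api_key="{API-KEY}",
    ecmwf_api_email="{EMAIL}",
)

weather_source.read_batch()
```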
Source code in src/sdk/python/rtdip_sdk/pipelines/sources/spark/ecmwf/weather_forecast.py
class SparkECMWFWeatherForecastSource(SourceInterface):
    """
    The Weather Forecast API V1 Source class to download nc files from ECMWF MARS server using the ECMWF python API.

    Parameters:
        spark (SparkSession): Spark Session instance
        save_path (str): Path to local directory where the nc files will be stored, in format "yyyy-mm-dd_HH.nc"
        date_start (str): Start date of extraction in "YYYY-MM-DD HH:MM:SS" format
        date_end (str): End date of extraction in "YYYY-MM-DD HH:MM:SS" format
        ecmwf_class (str): ecmwf classification of data
        stream (str): Operational model stream
        expver (str): Version of data
        leveltype (str): Surface level forecasts
        ec_vars (list): Variables of forecast measurements.
        forecast_area (list): N/W/S/E coordinates of the forecast area
        ecmwf_api_key (str): API key for ECMWF API
        ecmwf_api_email (str): Email for ECMWF API
    """

    spark: SparkSession

    def __init__(
        self,
        spark: SparkSession,
        save_path: str,
        date_start: str,
        date_end: str,
        ecmwf_class: str,
        stream: str,
        expver: str,
        leveltype: str,
        ec_vars: list,
        forecast_area: list,
        ecmwf_api_key: str,
        ecmwf_api_email: str,
    ) -> None:
        self.spark = spark
        self.save_path = save_path
        self.date_start = date_start
        self.date_end = date_end
        self.ecmwf_class = ecmwf_class
        self.stream = stream  # operational model
        self.expver = expver  # experiment version of data
        self.leveltype = leveltype  # surface level forecasts
        self.ec_vars = ec_vars  # variables
        self.forecast_area = forecast_area  # N/W/S/E
        self.ecmwf_api_key = ecmwf_api_key
        self.ecmwf_api_email = ecmwf_api_email

    @staticmethod
    def system_type():
        """
        Attributes:
            SystemType (Environment): Requires PYSPARK
        """
        return SystemType.PYSPARK

    @staticmethod
    def libraries():
        libraries = Libraries()
        return libraries

    @staticmethod
    def settings() -> dict:
        return {}

    def pre_read_validation(self):
        return True

    def post_read_validation(self):
        return True

    def read_stream(self):
        return True

    @classmethod
    def _get_lead_time(cls):
        """
        Lead time for the forecast data.
        90 hours - 1 Hour Interval
        90-146 - 3 Hour interval
        146 -246 - 6 Hour interval

        Returns:
            lead_times: Lead times in an array format.
        """
        lead_times = [*range(91), *range(93, 146, 3), *range(150, 246, 6)]
        np.array(lead_times)

        return lead_times

    def _get_api_params(self, lead_times):
        """
        API parameters for the forecast data.

        Returns:
            params (dict): API parameters for the forecast data.
        """

        params = {
            "class": self.ecmwf_class,  # ecmwf classification of data
            "stream": self.stream,  # operational model
            "expver": self.expver,  # experiment version of data
            "levtype": self.leveltype,  # surface level forecasts
            "type": "fc",  # forecasts
            "param": self.ec_vars,  # variables
            "step": lead_times,  # which lead times to download
            "area": self.forecast_area,  # N/W/S/E
            "grid": [0.1, 0.1],  # grid res of output
        }

        return params

    def read_batch(self):
        """
        Pulls data from the Weather API and returns as .nc files.

        """
        lead_times = self._get_lead_time()
        para = self._get_api_params(lead_times=lead_times)

        ec_conn = SparkECMWFBaseMarsSource(
            date_start=self.date_start,
            date_end=self.date_end,
            save_path=self.save_path,
            run_interval="12",
            run_frequency="H",
            ecmwf_api_key=self.ecmwf_api_key,
            ecmwf_api_email=self.ecmwf_api_email,
            ecmwf_api_url="https://api.ecmwf.int/v1",
        )

        ec_conn.retrieve(
            mars_dict=para,
            tries=5,
            n_jobs=-1,  # maximum of 20 queued requests per user (only two allowed active)
        )

system_type() staticmethod

Attributes:

Name Type Description
SystemType Environment

Requires PYSPARK

Source code in src/sdk/python/rtdip_sdk/pipelines/sources/spark/ecmwf/weather_forecast.py
@staticmethod
def system_type():
    """
    Attributes:
        SystemType (Environment): Requires PYSPARK
    """
    return SystemType.PYSPARK

read_batch()

Pulls data from the Weather API and returns as .nc files.

Source code in src/sdk/python/rtdip_sdk/pipelines/sources/spark/ecmwf/weather_forecast.py
def read_batch(self):
    """
    Pulls data from the Weather API and returns as .nc files.

    """
    lead_times = self._get_lead_time()
    para = self._get_api_params(lead_times=lead_times)

    ec_conn = SparkECMWFBaseMarsSource(
        date_start=self.date_start,
        date_end=self.date_end,
        save_path=self.save_path,
        run_interval="12",
        run_frequency="H",
        ecmwf_api_key=self.ecmwf_api_key,
        ecmwf_api_email=self.ecmwf_api_email,
        ecmwf_api_url="https://api.ecmwf.int/v1",
    )

    ec_conn.retrieve(
        mars_dict=para,
        tries=5,
        n_jobs=-1,  # maximum of 20 queued requests per user (only two allowed active)
    )
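
`read_batch()` derives its `step` values from `_get_lead_time()` in the class source above: hourly steps up to 90 hours, 3-hourly steps from 93 to 145 hours and 6-hourly steps from 150 to 240 hours. A small sketch of that list, reproduced for reference:

```python
# Mirrors _get_lead_time() in the class source above
lead_times = [*range(91), *range(93, 146, 3), *range(150, 246, 6)]

print(lead_times[:5])      # [0, 1, 2, 3, 4]
print(lead_times[90:95])   # [90, 93, 96, 99, 102]
print(lead_times[-3:])     # [228, 234, 240]
```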

SparkDeltaSource

Bases: SourceInterface

The Spark Delta Source is used to read data from a Delta table.

Example

#Delta Source for Streaming Queries

from rtdip_sdk.pipelines.sources import SparkDeltaSource
from rtdip_sdk.pipelines.utilities import SparkSessionUtility

# Not required if using Databricks
spark = SparkSessionUtility(config={}).execute()

delta_source = SparkDeltaSource(
    spark=spark,
    options={
        "maxFilesPerTrigger": 1000,
        "ignoreChanges: True,
        "startingVersion": 0
    },
    table_name="{YOUR-DELTA-TABLE-PATH}"
)

delta_source.read_stream()
#Delta Source for Batch Queries

from rtdip_sdk.pipelines.sources import SparkDeltaSource
from rtdip_sdk.pipelines.utilities import SparkSessionUtility

# Not required if using Databricks
spark = SparkSessionUtility(config={}).execute()

delta_source = SparkDeltaSource(
    spark=spark,
    options={
        "versionAsOf": 0,
        "timestampAsOf": "yyyy-mm-dd hh:mm:ss[.fffffffff]"
    },
    table_name="{YOUR-DELTA-TABLE-PATH}"
)

delta_source.read_batch()

Parameters:

Name Type Description Default
spark SparkSession

Spark Session required to read data from a Delta table.

required
options dict

Options that can be specified for a Delta Table read operation (See Attributes table below). Further information on the options is available for batch and streaming.

required
table_name str

Name of the Hive Metastore or Unity Catalog Delta Table

required

Attributes:

Name Type Description
maxFilesPerTrigger int

How many new files to be considered in every micro-batch. The default is 1000. (Streaming)

maxBytesPerTrigger int

How much data gets processed in each micro-batch. (Streaming)

ignoreDeletes bool str

Ignore transactions that delete data at partition boundaries. (Streaming)

ignoreChanges bool str

Pre-process updates if files had to be rewritten in the source table due to a data changing operation. (Streaming)

startingVersion int str

The Delta Lake version to start from. (Streaming)

startingTimestamp datetime str

The timestamp to start from. (Streaming)

withEventTimeOrder bool str

Whether the initial snapshot should be processed with event time order. (Streaming)

timestampAsOf datetime str

Query the Delta Table from a specific point in time. (Batch)

versionAsOf int str

Query the Delta Table from a specific version. (Batch)

Source code in src/sdk/python/rtdip_sdk/pipelines/sources/spark/delta.py
class SparkDeltaSource(SourceInterface):
    """
    The Spark Delta Source is used to read data from a Delta table.

    Example
    --------
    ```python
    #Delta Source for Streaming Queries

    from rtdip_sdk.pipelines.sources import SparkDeltaSource
    from rtdip_sdk.pipelines.utilities import SparkSessionUtility

    # Not required if using Databricks
    spark = SparkSessionUtility(config={}).execute()

    delta_source = SparkDeltaSource(
        spark=spark,
        options={
            "maxFilesPerTrigger": 1000,
            "ignoreChanges: True,
            "startingVersion": 0
        },
        table_name="{YOUR-DELTA-TABLE-PATH}"
    )

    delta_source.read_stream()
    ```
    ```python
    #Delta Source for Batch Queries

    from rtdip_sdk.pipelines.sources import SparkDeltaSource
    from rtdip_sdk.pipelines.utilities import SparkSessionUtility

    # Not required if using Databricks
    spark = SparkSessionUtility(config={}).execute()

    delta_source = SparkDeltaSource(
        spark=spark,
        options={
            "versionAsOf": 0,
            "timestampAsOf": "yyyy-mm-dd hh:mm:ss[.fffffffff]"
        },
        table_name="{YOUR-DELTA-TABLE-PATH}"
    )

    delta_source.read_batch()
    ```

    Parameters:
        spark (SparkSession): Spark Session required to read data from a Delta table.
        options (dict): Options that can be specified for a Delta Table read operation (See Attributes table below). Further information on the options is available for [batch](https://docs.delta.io/latest/delta-batch.html#read-a-table){ target="_blank" } and [streaming](https://docs.delta.io/latest/delta-streaming.html#delta-table-as-a-source){ target="_blank" }.
        table_name (str): Name of the Hive Metastore or Unity Catalog Delta Table

    Attributes:
        maxFilesPerTrigger (int): How many new files to be considered in every micro-batch. The default is 1000. (Streaming)
        maxBytesPerTrigger (int): How much data gets processed in each micro-batch. (Streaming)
        ignoreDeletes (bool str): Ignore transactions that delete data at partition boundaries. (Streaming)
        ignoreChanges (bool str): Pre-process updates if files had to be rewritten in the source table due to a data changing operation. (Streaming)
        startingVersion (int str): The Delta Lake version to start from. (Streaming)
        startingTimestamp (datetime str): The timestamp to start from. (Streaming)
        withEventTimeOrder (bool str): Whether the initial snapshot should be processed with event time order. (Streaming)
        timestampAsOf (datetime str): Query the Delta Table from a specific point in time. (Batch)
        versionAsOf (int str): Query the Delta Table from a specific version. (Batch)
    """

    spark: SparkSession
    options: dict
    table_name: str

    def __init__(self, spark: SparkSession, options: dict, table_name: str) -> None:
        self.spark = spark
        self.options = options
        self.table_name = table_name

    @staticmethod
    def system_type():
        """
        Attributes:
            SystemType (Environment): Requires PYSPARK
        """
        return SystemType.PYSPARK

    @staticmethod
    def libraries():
        libraries = Libraries()
        libraries.add_maven_library(get_default_package("spark_delta_core"))
        return libraries

    @staticmethod
    def settings() -> dict:
        return {}

    def pre_read_validation(self):
        return True

    def post_read_validation(self):
        return True

    def read_batch(self):
        """
        Reads batch data from Delta. Most of the options provided by the Apache Spark DataFrame read API are supported for performing batch reads on Delta tables.
        """
        try:
            return (
                self.spark.read.format("delta")
                .options(**self.options)
                .table(self.table_name)
            )

        except Py4JJavaError as e:
            logging.exception(e.errmsg)
            raise e
        except Exception as e:
            logging.exception(str(e))
            raise e

    def read_stream(self) -> DataFrame:
        """
        Reads streaming data from Delta. All of the data in the table is processed as well as any new data that arrives after the stream started. .load() can take table name or path.
        """
        try:
            return (
                self.spark.readStream.format("delta")
                .options(**self.options)
                .load(self.table_name)
            )

        except Py4JJavaError as e:
            logging.exception(e.errmsg)
            raise e
        except Exception as e:
            logging.exception(str(e))
            raise e

system_type() staticmethod

Attributes:

Name Type Description
SystemType Environment

Requires PYSPARK

Source code in src/sdk/python/rtdip_sdk/pipelines/sources/spark/delta.py
@staticmethod
def system_type():
    """
    Attributes:
        SystemType (Environment): Requires PYSPARK
    """
    return SystemType.PYSPARK

read_batch()

Reads batch data from Delta. Most of the options provided by the Apache Spark DataFrame read API are supported for performing batch reads on Delta tables.

Source code in src/sdk/python/rtdip_sdk/pipelines/sources/spark/delta.py
def read_batch(self):
    """
    Reads batch data from Delta. Most of the options provided by the Apache Spark DataFrame read API are supported for performing batch reads on Delta tables.
    """
    try:
        return (
            self.spark.read.format("delta")
            .options(**self.options)
            .table(self.table_name)
        )

    except Py4JJavaError as e:
        logging.exception(e.errmsg)
        raise e
    except Exception as e:
        logging.exception(str(e))
        raise e

read_stream()

Reads streaming data from Delta. All of the data in the table is processed as well as any new data that arrives after the stream started. .load() can take table name or path.

Source code in src/sdk/python/rtdip_sdk/pipelines/sources/spark/delta.py
def read_stream(self) -> DataFrame:
    """
    Reads streaming data from Delta. All of the data in the table is processed as well as any new data that arrives after the stream started. .load() can take table name or path.
    """
    try:
        return (
            self.spark.readStream.format("delta")
            .options(**self.options)
            .load(self.table_name)
        )

    except Py4JJavaError as e:
        logging.exception(e.errmsg)
        raise e
    except Exception as e:
        logging.exception(str(e))
        raise e
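
`read_stream()` returns a streaming DataFrame, so a query still has to be started on it before any data flows. A minimal sketch using the `delta_source` object from the class example above; the console sink and checkpoint location are assumptions for illustration only.

```python
stream_df = delta_source.read_stream()

query = (
    stream_df.writeStream
    .format("console")                                              # illustrative sink
    .option("checkpointLocation", "/tmp/checkpoints/delta_source")  # assumed path
    .start()
)

query.awaitTermination()
```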

BinaryToStringTransformer

Bases: TransformerInterface

Converts a dataframe body column from a binary to a string.

Example

from rtdip_sdk.pipelines.transformers import BinaryToStringTransformer

binary_to_string_transformer = BinaryToStringTransformer(
    data=df,
    source_column_name="body",
    target_column_name="body"
)

result = binary_to_string_transformer.transform()

Parameters:

Name Type Description Default
data DataFrame

Dataframe to be transformed

required
source_column_name str

Spark Dataframe column containing the Binary data

required
target_column_name str

Spark Dataframe column name to be used for the String data

required
Source code in src/sdk/python/rtdip_sdk/pipelines/transformers/spark/binary_to_string.py
class BinaryToStringTransformer(TransformerInterface):
    """
    Converts a dataframe body column from a binary to a string.

    Example
    --------
    ```python
    from rtdip_sdk.pipelines.transformers import BinaryToStringTransformer

    binary_to_string_transformer = BinaryToStringTransformer(
        data=df,
        source_column_name="body",
        target_column_name="body"
    )

    result = binary_to_string_transformer.transform()
    ```

    Parameters:
        data (DataFrame): Dataframe to be transformed
        source_column_name (str): Spark Dataframe column containing the Binary data
        target_column_name (str): Spark Dataframe column name to be used for the String data
    """

    data: DataFrame
    source_column_name: str
    target_column_name: str

    def __init__(
        self, data: DataFrame, source_column_name: str, target_column_name: str
    ) -> None:
        self.data = data
        self.source_column_name = source_column_name
        self.target_column_name = target_column_name

    @staticmethod
    def system_type():
        """
        Attributes:
            SystemType (Environment): Requires PYSPARK
        """
        return SystemType.PYSPARK

    @staticmethod
    def libraries():
        libraries = Libraries()
        return libraries

    @staticmethod
    def settings() -> dict:
        return {}

    def pre_transform_validation(self):
        return True

    def post_transform_validation(self):
        return True

    def transform(self) -> DataFrame:
        """
        Returns:
            DataFrame: A dataframe with the body column converted to string.
        """
        return self.data.withColumn(
            self.target_column_name, self.data[self.source_column_name].cast("string")
        )

system_type() staticmethod

Attributes:

Name Type Description
SystemType Environment

Requires PYSPARK

Source code in src/sdk/python/rtdip_sdk/pipelines/transformers/spark/binary_to_string.py
@staticmethod
def system_type():
    """
    Attributes:
        SystemType (Environment): Requires PYSPARK
    """
    return SystemType.PYSPARK

transform()

Returns:

Name Type Description
DataFrame DataFrame

A dataframe with the body column converted to string.

Source code in src/sdk/python/rtdip_sdk/pipelines/transformers/spark/binary_to_string.py
def transform(self) -> DataFrame:
    """
    Returns:
        DataFrame: A dataframe with the body column converted to string.
    """
    return self.data.withColumn(
        self.target_column_name, self.data[self.source_column_name].cast("string")
    )

OPCPublisherOPCUAJsonToPCDMTransformer

Bases: TransformerInterface

Converts a Spark Dataframe column containing a json string created by OPC Publisher to the Process Control Data Model.

Example

from rtdip_sdk.pipelines.transformers import OPCPublisherOPCUAJsonToPCDMTransformer

opc_publisher_opcua_json_to_pcdm_transformer = OPCPublisherOPCUAJsonToPCDMTransformer(
    data=df,
    source_column_name="body",
    multiple_rows_per_message=True,
    status_null_value="Good",
    change_type_value="insert",
    timestamp_formats=[
        "yyyy-MM-dd'T'HH:mm:ss.SSSX",
        "yyyy-MM-dd'T'HH:mm:ssX"
    ],
    filter=None
)

result = opc_publisher_opcua_json_to_pcdm_transformer.transform()

Parameters:

Name Type Description Default
data DataFrame

Dataframe containing the column with Json OPC UA data

required
source_column_name str

Spark Dataframe column containing the OPC Publisher Json OPC UA data

required
multiple_rows_per_message optional bool

Each Dataframe Row contains an array of/multiple OPC UA messages. The list of Json will be exploded into rows in the Dataframe.

True
status_null_value optional str

If populated, will replace null values in the Status column with the specified value.

None
change_type_value optional str

If populated, will replace 'insert' in the ChangeType column with the specified value.

'insert'
timestamp_formats optional list[str]

Specifies the timestamp formats to be used for converting the timestamp string to a Timestamp Type. For more information on formats, refer to this documentation.

["yyyy-MM-dd'T'HH:mm:ss.SSSX", "yyyy-MM-dd'T'HH:mm:ssX"]
filter optional str

Enables providing a filter to the data which can be required in certain scenarios. For example, it would be possible to filter on IoT Hub Device Id and Module by providing a filter in SQL format such as systemProperties.iothub-connection-device-id = "<Device Id>" AND systemProperties.iothub-connection-module-id = "<Module>"

None
Source code in src/sdk/python/rtdip_sdk/pipelines/transformers/spark/opc_publisher_opcua_json_to_pcdm.py
class OPCPublisherOPCUAJsonToPCDMTransformer(TransformerInterface):
    """
    Converts a Spark Dataframe column containing a json string created by OPC Publisher to the Process Control Data Model.

    Example
    --------
    ```python
    from rtdip_sdk.pipelines.transformers import OPCPublisherOPCUAJsonToPCDMTransformer

    opc_publisher_opcua_json_to_pcdm_transformer = OPCPublisherOPCUAJsonToPCDMTransformer(
        data=df,
        source_column_name="body",
        multiple_rows_per_message=True,
        status_null_value="Good",
        change_type_value="insert",
        timestamp_formats=[
            "yyyy-MM-dd'T'HH:mm:ss.SSSX",
            "yyyy-MM-dd'T'HH:mm:ssX"
        ],
        filter=None
    )

    result = opc_publisher_opcua_json_to_pcdm_transformer.transform()
    ```

    Parameters:
        data (DataFrame): Dataframe containing the column with Json OPC UA data
        source_column_name (str): Spark Dataframe column containing the OPC Publisher Json OPC UA data
        multiple_rows_per_message (optional bool): Each Dataframe Row contains an array of/multiple OPC UA messages. The list of Json will be exploded into rows in the Dataframe.
        status_null_value (optional str): If populated, will replace null values in the Status column with the specified value.
        change_type_value (optional str): If populated, will replace 'insert' in the ChangeType column with the specified value.
        timestamp_formats (optional list[str]): Specifies the timestamp formats to be used for converting the timestamp string to a Timestamp Type. For more information on formats, refer to this [documentation.](https://spark.apache.org/docs/latest/sql-ref-datetime-pattern.html)
        filter (optional str): Enables providing a filter to the data which can be required in certain scenarios. For example, it would be possible to filter on IoT Hub Device Id and Module by providing a filter in SQL format such as `systemProperties.iothub-connection-device-id = "<Device Id>" AND systemProperties.iothub-connection-module-id = "<Module>"`
    """

    data: DataFrame
    source_column_name: str
    multiple_rows_per_message: bool
    tagname_field: str
    status_null_value: str
    change_type_value: str
    timestamp_formats: list
    filter: str

    def __init__(
        self,
        data: DataFrame,
        source_column_name: str,
        multiple_rows_per_message: bool = True,
        tagname_field: str = "DisplayName",
        status_null_value: str = None,
        change_type_value: str = "insert",
        timestamp_formats: list = [
            "yyyy-MM-dd'T'HH:mm:ss.SSSX",
            "yyyy-MM-dd'T'HH:mm:ssX",
        ],
        filter: str = None,
    ) -> None:  # NOSONAR
        self.data = data
        self.source_column_name = source_column_name
        self.multiple_rows_per_message = multiple_rows_per_message
        self.tagname_field = tagname_field
        self.status_null_value = status_null_value
        self.change_type_value = change_type_value
        self.timestamp_formats = timestamp_formats
        self.filter = filter

    @staticmethod
    def system_type():
        """
        Attributes:
            SystemType (Environment): Requires PYSPARK
        """
        return SystemType.PYSPARK

    @staticmethod
    def libraries():
        libraries = Libraries()
        return libraries

    @staticmethod
    def settings() -> dict:
        return {}

    def pre_transform_validation(self):
        return True

    def post_transform_validation(self):
        return True

    def transform(self) -> DataFrame:
        """
        Returns:
            DataFrame: A dataframe with the specified column converted to PCDM
        """
        if self.multiple_rows_per_message:
            df = self.data.withColumn(
                self.source_column_name,
                from_json(col(self.source_column_name), ArrayType(StringType())),
            ).withColumn(self.source_column_name, explode(self.source_column_name))
        else:
            df = self.data.withColumn(
                self.source_column_name,
                from_json(col(self.source_column_name), StringType()),
            )

        if self.filter != None:
            df = df.where(self.filter)

        df = (
            df.withColumn(
                "OPCUA", from_json(col(self.source_column_name), OPC_PUBLISHER_SCHEMA)
            )
            .withColumn("TagName", (col("OPCUA.{}".format(self.tagname_field))))
            .withColumn(
                "EventTime",
                coalesce(
                    *[
                        to_timestamp(col("OPCUA.Value.SourceTimestamp"), f)
                        for f in self.timestamp_formats
                    ]
                ),
            )
            .withColumn("Value", col("OPCUA.Value.Value"))
            .withColumn(
                "ValueType",
                when(col("Value").cast("float").isNotNull(), "float")
                .when(col("Value").cast("float").isNull(), "string")
                .otherwise("unknown"),
            )
            .withColumn("ChangeType", lit(self.change_type_value))
        )

        status_col_name = "OPCUA.Value.StatusCode.Symbol"
        if self.status_null_value != None:
            df = df.withColumn(
                "Status",
                when(col(status_col_name).isNotNull(), col(status_col_name)).otherwise(
                    lit(self.status_null_value)
                ),
            )
        else:
            df = df.withColumn("Status", col(status_col_name))

        return df.select(
            "TagName", "EventTime", "Status", "Value", "ValueType", "ChangeType"
        )

system_type() staticmethod

Attributes:

Name Type Description
SystemType Environment

Requires PYSPARK

Source code in src/sdk/python/rtdip_sdk/pipelines/transformers/spark/opc_publisher_opcua_json_to_pcdm.py
@staticmethod
def system_type():
    """
    Attributes:
        SystemType (Environment): Requires PYSPARK
    """
    return SystemType.PYSPARK

transform()

Returns:

Name Type Description
DataFrame DataFrame

A dataframe with the specified column converted to PCDM

Source code in src/sdk/python/rtdip_sdk/pipelines/transformers/spark/opc_publisher_opcua_json_to_pcdm.py
def transform(self) -> DataFrame:
    """
    Returns:
        DataFrame: A dataframe with the specified column converted to PCDM
    """
    if self.multiple_rows_per_message:
        df = self.data.withColumn(
            self.source_column_name,
            from_json(col(self.source_column_name), ArrayType(StringType())),
        ).withColumn(self.source_column_name, explode(self.source_column_name))
    else:
        df = self.data.withColumn(
            self.source_column_name,
            from_json(col(self.source_column_name), StringType()),
        )

    if self.filter != None:
        df = df.where(self.filter)

    df = (
        df.withColumn(
            "OPCUA", from_json(col(self.source_column_name), OPC_PUBLISHER_SCHEMA)
        )
        .withColumn("TagName", (col("OPCUA.{}".format(self.tagname_field))))
        .withColumn(
            "EventTime",
            coalesce(
                *[
                    to_timestamp(col("OPCUA.Value.SourceTimestamp"), f)
                    for f in self.timestamp_formats
                ]
            ),
        )
        .withColumn("Value", col("OPCUA.Value.Value"))
        .withColumn(
            "ValueType",
            when(col("Value").cast("float").isNotNull(), "float")
            .when(col("Value").cast("float").isNull(), "string")
            .otherwise("unknown"),
        )
        .withColumn("ChangeType", lit(self.change_type_value))
    )

    status_col_name = "OPCUA.Value.StatusCode.Symbol"
    if self.status_null_value != None:
        df = df.withColumn(
            "Status",
            when(col(status_col_name).isNotNull(), col(status_col_name)).otherwise(
                lit(self.status_null_value)
            ),
        )
    else:
        df = df.withColumn("Status", col(status_col_name))

    return df.select(
        "TagName", "EventTime", "Status", "Value", "ValueType", "ChangeType"
    )

OPCPublisherOPCAEJsonToPCDMTransformer

Bases: TransformerInterface

Converts a Spark Dataframe column containing a json string created by OPC Publisher for A&E (Alarm & Events) data to the Process Control Data Model.

Example

from rtdip_sdk.pipelines.transformers import OPCPublisherOPCAEJsonToPCDMTransformer

opc_publisher_opcae_json_to_pcdm_transformer = OPCPublisherOPCAEJsonToPCDMTransformer(
    data=df,
    source_column_name="body",
    timestamp_formats=[
        "yyyy-MM-dd'T'HH:mm:ss.SSSX",
        "yyyy-MM-dd'T'HH:mm:ssX"
    ],
    filter=None
)

result = opc_publisher_opcae_json_to_pcdm_transformer.transform()

Parameters:

Name Type Description Default
data DataFrame

Dataframe containing the column with Json OPC AE data

required
source_column_name str

Spark Dataframe column containing the OPC Publisher Json OPC AE data

required
timestamp_formats optional list[str]

Specifies the timestamp formats to be used for converting the timestamp string to a Timestamp Type. For more information on formats, refer to this documentation.

None
filter optional str

Enables providing a filter to the data which can be required in certain scenarios. For example, it would be possible to filter on IoT Hub Device Id and Module by providing a filter in SQL format such as systemProperties.iothub-connection-device-id = "<Device Id>" AND systemProperties.iothub-connection-module-id = "<Module>"

None
Source code in src/sdk/python/rtdip_sdk/pipelines/transformers/spark/opc_publisher_opcae_json_to_pcdm.py
class OPCPublisherOPCAEJsonToPCDMTransformer(TransformerInterface):
    """
    Converts a Spark Dataframe column containing a json string created by OPC Publisher for A&E (Alarm & Events) data to the Process Control Data Model.

    Example
    --------
    ```python
    from rtdip_sdk.pipelines.transformers import OPCPublisherOPCAEJsonToPCDMTransformer

    opc_publisher_opcae_json_to_pcdm_transformer = OPCPublisherOPCAEJsonToPCDMTransformer(
        data=df,
        source_column_name="body",
        timestamp_formats=[
            "yyyy-MM-dd'T'HH:mm:ss.SSSX",
            "yyyy-MM-dd'T'HH:mm:ssX"
        ],
        filter=None
    )

    result = opc_publisher_opcae_json_to_pcdm_transformer.transform()
    ```

    Parameters:
        data (DataFrame): Dataframe containing the column with Json OPC AE data
        source_column_name (str): Spark Dataframe column containing the OPC Publisher Json OPC AE data
        timestamp_formats (optional list[str]): Specifies the timestamp formats to be used for converting the timestamp string to a Timestamp Type. For more information on formats, refer to this [documentation.](https://spark.apache.org/docs/latest/sql-ref-datetime-pattern.html)
        filter (optional str): Enables providing a filter to the data which can be required in certain scenarios. For example, it would be possible to filter on IoT Hub Device Id and Module by providing a filter in SQL format such as `systemProperties.iothub-connection-device-id = "<Device Id>" AND systemProperties.iothub-connection-module-id = "<Module>"`
    """

    data: DataFrame
    source_column_name: str
    timestamp_formats: list
    filter: str

    def __init__(
        self,
        data: DataFrame,
        source_column_name: str,
        timestamp_formats=None,
        filter: str = None,
    ) -> None:  # NOSONAR
        self.data = data
        self.source_column_name = source_column_name
        self.timestamp_formats = timestamp_formats or [
            "yyyy-MM-dd'T'HH:mm:ss.SSSX",
            "yyyy-MM-dd'T'HH:mm:ssX",
        ]
        self.filter = filter

    @staticmethod
    def system_type():
        """
        Attributes:
            SystemType (Environment): Requires PYSPARK
        """
        return SystemType.PYSPARK

    @staticmethod
    def libraries():
        libraries = Libraries()
        return libraries

    @staticmethod
    def settings() -> dict:
        return {}

    def pre_transform_validation(self):
        return True

    def post_transform_validation(self):
        return True

    def transform(self) -> DataFrame:
        """
        Returns:
            DataFrame: A dataframe with the OPC Publisher A&E data converted to the Process Control Data Model
        """

        df = self.data.withColumn(
            self.source_column_name,
            from_json(col(self.source_column_name), ArrayType(StringType())),
        ).withColumn(self.source_column_name, explode(self.source_column_name))

        if self.filter != None:
            df = df.where(self.filter)

        df = df.withColumn(
            "OPCAE", from_json(col(self.source_column_name), OPC_PUBLISHER_AE_SCHEMA)
        )

        df = df.select(
            col("OPCAE.NodeId"),
            col("OPCAE.DisplayName"),
            col("OPCAE.Value.ConditionId.Value").alias("ConditionId"),
            col("OPCAE.Value.AckedState.Value").alias("AckedState"),
            col("OPCAE.Value.AckedState/FalseState.Value").alias(
                "AckedState/FalseState"
            ),
            col("OPCAE.Value.AckedState/Id.Value").alias("AckedState/Id"),
            col("OPCAE.Value.AckedState/TrueState.Value").alias("AckedState/TrueState"),
            col("OPCAE.Value.ActiveState.Value").alias("ActiveState"),
            col("OPCAE.Value.ActiveState/FalseState.Value").alias(
                "ActiveState/FalseState"
            ),
            col("OPCAE.Value.ActiveState/Id.Value").alias("ActiveState/Id"),
            col("OPCAE.Value.ActiveState/TrueState.Value").alias(
                "ActiveState/TrueState"
            ),
            col("OPCAE.Value.EnabledState.Value").alias("EnabledState"),
            col("OPCAE.Value.EnabledState/FalseState.Value").alias(
                "EnabledState/FalseState"
            ),
            col("OPCAE.Value.EnabledState/Id.Value").alias("EnabledState/Id"),
            col("OPCAE.Value.EnabledState/TrueState.Value").alias(
                "EnabledState/TrueState"
            ),
            col("OPCAE.Value.EventId.Value").alias("EventId"),
            col("OPCAE.Value.EventType.Value").alias("EventType"),
            col("OPCAE.Value.HighHighLimit.Value").alias("HighHighLimit"),
            col("OPCAE.Value.HighLimit.Value").alias("HighLimit"),
            col("OPCAE.Value.InputNode.Value").alias("InputNode"),
            col("OPCAE.Value.LowLimit.Value").alias("LowLimit"),
            col("OPCAE.Value.LowLowLimit.Value").alias("LowLowLimit"),
            col("OPCAE.Value.Message.Value").alias("Message"),
            col("OPCAE.Value.Quality.Value").alias("Quality"),
            col("OPCAE.Value.ReceiveTime.Value").alias("ReceiveTime"),
            col("OPCAE.Value.Retain.Value").alias("Retain"),
            col("OPCAE.Value.Severity.Value").alias("Severity"),
            col("OPCAE.Value.SourceName.Value").alias("SourceName"),
            col("OPCAE.Value.SourceNode.Value").alias("SourceNode"),
            col("OPCAE.Value.Time.Value").alias("EventTime"),
        )

        df = df.withColumn(
            "EventTime",
            coalesce(
                *[to_timestamp(col("EventTime"), f) for f in self.timestamp_formats]
            ),
        )

        return df

system_type() staticmethod

Attributes:

Name Type Description
SystemType Environment

Requires PYSPARK

Source code in src/sdk/python/rtdip_sdk/pipelines/transformers/spark/opc_publisher_opcae_json_to_pcdm.py
@staticmethod
def system_type():
    """
    Attributes:
        SystemType (Environment): Requires PYSPARK
    """
    return SystemType.PYSPARK

transform()

Returns:

Name Type Description
DataFrame DataFrame

A dataframe with the OPC Publisher A&E data converted to the Process Control Data Model

Source code in src/sdk/python/rtdip_sdk/pipelines/transformers/spark/opc_publisher_opcae_json_to_pcdm.py
def transform(self) -> DataFrame:
    """
    Returns:
        DataFrame: A dataframe with the OPC Publisher A&E data converted to the Process Control Data Model
    """

    df = self.data.withColumn(
        self.source_column_name,
        from_json(col(self.source_column_name), ArrayType(StringType())),
    ).withColumn(self.source_column_name, explode(self.source_column_name))

    if self.filter != None:
        df = df.where(self.filter)

    df = df.withColumn(
        "OPCAE", from_json(col(self.source_column_name), OPC_PUBLISHER_AE_SCHEMA)
    )

    df = df.select(
        col("OPCAE.NodeId"),
        col("OPCAE.DisplayName"),
        col("OPCAE.Value.ConditionId.Value").alias("ConditionId"),
        col("OPCAE.Value.AckedState.Value").alias("AckedState"),
        col("OPCAE.Value.AckedState/FalseState.Value").alias(
            "AckedState/FalseState"
        ),
        col("OPCAE.Value.AckedState/Id.Value").alias("AckedState/Id"),
        col("OPCAE.Value.AckedState/TrueState.Value").alias("AckedState/TrueState"),
        col("OPCAE.Value.ActiveState.Value").alias("ActiveState"),
        col("OPCAE.Value.ActiveState/FalseState.Value").alias(
            "ActiveState/FalseState"
        ),
        col("OPCAE.Value.ActiveState/Id.Value").alias("ActiveState/Id"),
        col("OPCAE.Value.ActiveState/TrueState.Value").alias(
            "ActiveState/TrueState"
        ),
        col("OPCAE.Value.EnabledState.Value").alias("EnabledState"),
        col("OPCAE.Value.EnabledState/FalseState.Value").alias(
            "EnabledState/FalseState"
        ),
        col("OPCAE.Value.EnabledState/Id.Value").alias("EnabledState/Id"),
        col("OPCAE.Value.EnabledState/TrueState.Value").alias(
            "EnabledState/TrueState"
        ),
        col("OPCAE.Value.EventId.Value").alias("EventId"),
        col("OPCAE.Value.EventType.Value").alias("EventType"),
        col("OPCAE.Value.HighHighLimit.Value").alias("HighHighLimit"),
        col("OPCAE.Value.HighLimit.Value").alias("HighLimit"),
        col("OPCAE.Value.InputNode.Value").alias("InputNode"),
        col("OPCAE.Value.LowLimit.Value").alias("LowLimit"),
        col("OPCAE.Value.LowLowLimit.Value").alias("LowLowLimit"),
        col("OPCAE.Value.Message.Value").alias("Message"),
        col("OPCAE.Value.Quality.Value").alias("Quality"),
        col("OPCAE.Value.ReceiveTime.Value").alias("ReceiveTime"),
        col("OPCAE.Value.Retain.Value").alias("Retain"),
        col("OPCAE.Value.Severity.Value").alias("Severity"),
        col("OPCAE.Value.SourceName.Value").alias("SourceName"),
        col("OPCAE.Value.SourceNode.Value").alias("SourceNode"),
        col("OPCAE.Value.Time.Value").alias("EventTime"),
    )

    df = df.withColumn(
        "EventTime",
        coalesce(
            *[to_timestamp(col("EventTime"), f) for f in self.timestamp_formats]
        ),
    )

    return df

FledgeOPCUAJsonToPCDMTransformer

Bases: TransformerInterface

Converts a Spark Dataframe column containing a json string created by Fledge to the Process Control Data Model.

Example

from rtdip_sdk.pipelines.transformers import FledgeOPCUAJsonToPCDMTransformer

fledge_opcua_json_to_pcdm_transformer = FledgeOPCUAJsonToPCDMTransformer(
    data=df,
    source_column_name="body",
    status_null_value="Good",
    change_type_value="insert",
    timestamp_formats=[
        "yyyy-MM-dd'T'HH:mm:ss.SSSX",
        "yyyy-MM-dd'T'HH:mm:ssX",
    ]
)

result = fledge_opcua_json_to_pcdm_transformer.transform()

Parameters:

Name Type Description Default
data DataFrame

Dataframe containing the column with Json Fledge data

required
source_column_name str

Spark Dataframe column containing the OPC Publisher Json OPC UA data

required
status_null_value str

If populated, will replace 'Good' in the Status column with the specified value.

'Good'
change_type_value optional str

If populated, will replace 'insert' in the ChangeType column with the specified value.

'insert'
timestamp_formats list[str]

Specifies the timestamp formats to be used for converting the timestamp string to a Timestamp Type. For more information on formats, refer to this documentation.

["yyyy-MM-dd'T'HH:mm:ss.SSSX", "yyyy-MM-dd'T'HH:mm:ssX"]
Source code in src/sdk/python/rtdip_sdk/pipelines/transformers/spark/fledge_opcua_json_to_pcdm.py
class FledgeOPCUAJsonToPCDMTransformer(TransformerInterface):
    """
    Converts a Spark Dataframe column containing a json string created by Fledge to the Process Control Data Model.

    Example
    --------
    ```python
    from rtdip_sdk.pipelines.transformers import FledgeOPCUAJsonToPCDMTransformer

    fledge_opcua_json_to_pcdm_transformer = FledgeOPCUAJsonToPCDMTransformer(
        data=df,
        source_column_name="body",
        status_null_value="Good",
        change_type_value="insert",
        timestamp_formats=[
            "yyyy-MM-dd'T'HH:mm:ss.SSSX",
            "yyyy-MM-dd'T'HH:mm:ssX",
        ]
    )

    result = fledge_opcua_json_to_pcdm_transformer.transform()
    ```

    Parameters:
        data (DataFrame): Dataframe containing the column with Json Fledge data
        source_column_name (str): Spark Dataframe column containing the OPC Publisher Json OPC UA data
        status_null_value (str): If populated, will replace 'Good' in the Status column with the specified value.
        change_type_value (optional str): If populated, will replace 'insert' in the ChangeType column with the specified value.
        timestamp_formats (list[str]): Specifies the timestamp formats to be used for converting the timestamp string to a Timestamp Type. For more information on formats, refer to this [documentation.](https://spark.apache.org/docs/latest/sql-ref-datetime-pattern.html)
    """

    data: DataFrame
    source_column_name: str
    status_null_value: str
    change_type_value: str
    timestamp_formats: list

    def __init__(
        self,
        data: DataFrame,
        source_column_name: str,
        status_null_value: str = "Good",
        change_type_value: str = "insert",
        timestamp_formats: list = [
            "yyyy-MM-dd'T'HH:mm:ss.SSSX",
            "yyyy-MM-dd'T'HH:mm:ssX",
        ],
    ) -> None:  # NOSONAR
        self.data = data
        self.source_column_name = source_column_name
        self.status_null_value = status_null_value
        self.change_type_value = change_type_value
        self.timestamp_formats = timestamp_formats

    @staticmethod
    def system_type():
        """
        Attributes:
            SystemType (Environment): Requires PYSPARK
        """
        return SystemType.PYSPARK

    @staticmethod
    def libraries():
        libraries = Libraries()
        return libraries

    @staticmethod
    def settings() -> dict:
        return {}

    def pre_transform_validation(self):
        return True

    def post_transform_validation(self):
        return True

    def transform(self) -> DataFrame:
        """
        Returns:
            DataFrame: A dataframe with the specified column converted to PCDM
        """
        df = (
            self.data.withColumn(
                self.source_column_name,
                from_json(self.source_column_name, FLEDGE_SCHEMA),
            )
            .selectExpr("inline({})".format(self.source_column_name))
            .select(explode("readings"), "timestamp")
            .withColumn(
                "EventTime",
                coalesce(
                    *[to_timestamp(col("timestamp"), f) for f in self.timestamp_formats]
                ),
            )
            .withColumnRenamed("key", "TagName")
            .withColumnRenamed("value", "Value")
            .withColumn("Status", lit(self.status_null_value))
            .withColumn(
                "ValueType",
                when(col("value").cast("float").isNotNull(), "float").when(
                    col("value").cast("float").isNull(), "string"
                ),
            )
            .withColumn("ChangeType", lit(self.change_type_value))
        )

        return df.select(
            "TagName", "EventTime", "Status", "Value", "ValueType", "ChangeType"
        )

system_type() staticmethod

Attributes:

Name Type Description
SystemType Environment

Requires PYSPARK

Source code in src/sdk/python/rtdip_sdk/pipelines/transformers/spark/fledge_opcua_json_to_pcdm.py
@staticmethod
def system_type():
    """
    Attributes:
        SystemType (Environment): Requires PYSPARK
    """
    return SystemType.PYSPARK

transform()

Returns:

Name Type Description
DataFrame DataFrame

A dataframe with the specified column converted to PCDM

Source code in src/sdk/python/rtdip_sdk/pipelines/transformers/spark/fledge_opcua_json_to_pcdm.py
def transform(self) -> DataFrame:
    """
    Returns:
        DataFrame: A dataframe with the specified column converted to PCDM
    """
    df = (
        self.data.withColumn(
            self.source_column_name,
            from_json(self.source_column_name, FLEDGE_SCHEMA),
        )
        .selectExpr("inline({})".format(self.source_column_name))
        .select(explode("readings"), "timestamp")
        .withColumn(
            "EventTime",
            coalesce(
                *[to_timestamp(col("timestamp"), f) for f in self.timestamp_formats]
            ),
        )
        .withColumnRenamed("key", "TagName")
        .withColumnRenamed("value", "Value")
        .withColumn("Status", lit(self.status_null_value))
        .withColumn(
            "ValueType",
            when(col("value").cast("float").isNotNull(), "float").when(
                col("value").cast("float").isNull(), "string"
            ),
        )
        .withColumn("ChangeType", lit(self.change_type_value))
    )

    return df.select(
        "TagName", "EventTime", "Status", "Value", "ValueType", "ChangeType"
    )
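
The timestamp_formats parameter is resolved on a first-match-wins basis: coalesce keeps the first non-null result of to_timestamp across the supplied patterns. Below is a minimal, self-contained sketch of that technique on illustrative timestamps (not Fledge data):

```python
from pyspark.sql import SparkSession
from pyspark.sql.functions import coalesce, col, to_timestamp

spark = SparkSession.builder.getOrCreate()

# Two illustrative rows, each matching a different pattern
df = spark.createDataFrame(
    [("2023-01-01T12:00:00.123Z",), ("2023-01-01T12:00:00Z",)], ["timestamp"]
)

formats = ["yyyy-MM-dd'T'HH:mm:ss.SSSX", "yyyy-MM-dd'T'HH:mm:ssX"]

# to_timestamp returns null when a pattern does not match, so coalesce keeps
# the first format that successfully parses each row
df = df.withColumn(
    "EventTime", coalesce(*[to_timestamp(col("timestamp"), f) for f in formats])
)
df.show(truncate=False)
```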

SSIPPIBinaryFileToPCDMTransformer

Bases: TransformerInterface

Converts a Spark DataFrame column containing binaryFile parquet data to the Process Control Data Model.

This DataFrame should contain a path column and the binary data. Typically this is achieved by using the Auto Loader source component and specifying "binaryFile" as the format (see the sketch after the example below).

For more information about the SSIP PI Batch Connector, please see here.

Example

from rtdip_sdk.pipelines.transformers import SSIPPIBinaryFileToPCDMTransformer

ssip_pi_binary_file_to_pcdm_transformer = SSIPPIBinaryFileToPCDMTransformer(
    data=df
)

result = ssip_pi_binary_file_to_pcdm_transformer.transform()
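
Since the expected input is typically produced by the Auto Loader source reading in binaryFile format, a minimal end-to-end sketch could look as follows (the storage path is a placeholder and the destination write step is omitted):

```python
from rtdip_sdk.pipelines.sources import DataBricksAutoLoaderSource
from rtdip_sdk.pipelines.transformers import SSIPPIBinaryFileToPCDMTransformer
from rtdip_sdk.pipelines.utilities import SparkSessionUtility

# Not required if using Databricks
spark = SparkSessionUtility(config={}).execute()

# Read the raw parquet files as binary records (path is a placeholder)
source_df = DataBricksAutoLoaderSource(
    spark,
    options={},
    path="abfss://{FILE-SYSTEM}@{ACCOUNT-NAME}.dfs.core.windows.net/{PATH}",
    format="binaryFile",
).read_batch()

# Convert the binary parquet payloads to the Process Control Data Model
pcdm_df = SSIPPIBinaryFileToPCDMTransformer(data=source_df).transform()
```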

Parameters:

Name Type Description Default
data DataFrame

DataFrame containing the path and binaryFile data

required
Source code in src/sdk/python/rtdip_sdk/pipelines/transformers/spark/ssip_pi_binary_file_to_pcdm.py
class SSIPPIBinaryFileToPCDMTransformer(TransformerInterface):
    """
    Converts a Spark DataFrame column containing binaryFile parquet data to the Process Control Data Model.

    This DataFrame should contain a path and the binary data. Typically this can be done using the Autoloader source component and specify "binaryFile" as the format.

    For more information about the SSIP PI Batch Connector, please see [here.](https://bakerhughesc3.ai/oai-solution/shell-sensor-intelligence-platform/)

    Example
    --------
    ```python
    from rtdip_sdk.pipelines.transformers import SSIPPIBinaryFileToPCDMTransformer

    ssip_pi_binary_file_to_pcdm_transformer = SSIPPIBinaryFileToPCDMTransformer(
        data=df
    )

    result = ssip_pi_binary_file_to_pcdm_transformer.transform()
    ```

    Parameters:
        data (DataFrame): DataFrame containing the path and binaryFile data
    """

    data: DataFrame

    def __init__(self, data: DataFrame) -> None:
        self.data = data

    @staticmethod
    def system_type():
        """
        Attributes:
            SystemType (Environment): Requires PYSPARK
        """
        return SystemType.PYSPARK

    @staticmethod
    def libraries():
        libraries = Libraries()
        libraries.add_pypi_library(get_default_package("pyarrow"))
        libraries.add_pypi_library(get_default_package("pandas"))
        return libraries

    @staticmethod
    def settings() -> dict:
        return {}

    def pre_transform_validation(self):
        return True

    def post_transform_validation(self):
        return True

    @staticmethod
    def _convert_binary_to_pandas(pdf):
        try:
            binary_list = pdf.values.tolist()
            binary_data = binary_list[0][3]
            buf = pa.py_buffer(binary_data)
            table = pq.read_table(buf)
        except Exception as e:
            print(str(e))
            return pd.DataFrame(
                {
                    "EventDate": pd.Series([], dtype="datetime64[ns]"),
                    "TagName": pd.Series([], dtype="str"),
                    "EventTime": pd.Series([], dtype="datetime64[ns]"),
                    "Status": pd.Series([], dtype="str"),
                    "Value": pd.Series([], dtype="str"),
                    "ValueType": pd.Series([], dtype="str"),
                    "ChangeType": pd.Series([], dtype="str"),
                }
            )

        output_pdf = table.to_pandas()

        if "ValueType" not in output_pdf.columns:
            value_type = str(table.schema.field("Value").type)
            if value_type == "int16" or value_type == "int32":
                value_type = "integer"
            output_pdf["ValueType"] = value_type

        if "ChangeType" not in output_pdf.columns:
            output_pdf["ChangeType"] = "insert"

        output_pdf["EventDate"] = output_pdf["EventTime"].dt.date
        output_pdf["Value"] = output_pdf["Value"].astype(str)
        output_pdf = output_pdf[
            [
                "EventDate",
                "TagName",
                "EventTime",
                "Status",
                "Value",
                "ValueType",
                "ChangeType",
            ]
        ]
        return output_pdf

    def transform(self) -> DataFrame:
        """
        Returns:
            DataFrame: A dataframe with the provided Binary data convert to PCDM
        """
        return self.data.groupBy("path").applyInPandas(
            SSIPPIBinaryFileToPCDMTransformer._convert_binary_to_pandas,
            schema="EventDate DATE, TagName STRING, EventTime TIMESTAMP, Status STRING, Value STRING, ValueType STRING, ChangeType STRING",
        )

system_type() staticmethod

Attributes:

Name Type Description
SystemType Environment

Requires PYSPARK

Source code in src/sdk/python/rtdip_sdk/pipelines/transformers/spark/ssip_pi_binary_file_to_pcdm.py
@staticmethod
def system_type():
    """
    Attributes:
        SystemType (Environment): Requires PYSPARK
    """
    return SystemType.PYSPARK

transform()

Returns:

Name Type Description
DataFrame DataFrame

A dataframe with the provided Binary data converted to PCDM

Source code in src/sdk/python/rtdip_sdk/pipelines/transformers/spark/ssip_pi_binary_file_to_pcdm.py
def transform(self) -> DataFrame:
    """
    Returns:
        DataFrame: A dataframe with the provided Binary data convert to PCDM
    """
    return self.data.groupBy("path").applyInPandas(
        SSIPPIBinaryFileToPCDMTransformer._convert_binary_to_pandas,
        schema="EventDate DATE, TagName STRING, EventTime TIMESTAMP, Status STRING, Value STRING, ValueType STRING, ChangeType STRING",
    )

SSIPPIJsonStreamToPCDMTransformer

Bases: TransformerInterface

Converts a Spark DataFrame containing Binary JSON data and related Properties to the Process Control Data Model

For more information about the SSIP PI Streaming Connector, please see here.

Example

from rtdip_sdk.pipelines.transformers import SSIPPIJsonStreamToPCDMTransformer
from rtdip_sdk.pipelines.utilities import SparkSessionUtility

# Not required if using Databricks
spark = SparkSessionUtility(config={}).execute()

ssip_pi_json_stream_to_pcdm_transformer = SSIPPIJsonStreamToPCDMTransformer(
    spark=spark,
    data=df,
    source_column_name="body",
    properties_column_name="",
    metadata_delta_table=None
)

result = ssip_pi_json_stream_to_pcdm_transformer.transform()

Parameters:

Name Type Description Default
spark SparkSession

Spark Session

required
data DataFrame

DataFrame containing the Binary JSON data and its related properties

required
source_column_name str

Spark Dataframe column containing the Binary json data

required
properties_column_name str

Spark Dataframe struct typed column containing an element with the PointType

required
metadata_delta_table (optional, str)

Name of a metadata table that can be used for PointType mappings

None
Source code in src/sdk/python/rtdip_sdk/pipelines/transformers/spark/ssip_pi_binary_json_to_pcdm.py
class SSIPPIJsonStreamToPCDMTransformer(TransformerInterface):
    """
    Converts a Spark DataFrame containing Binary JSON data and related Properties to the Process Control Data Model

    For more information about the SSIP PI Streaming Connector, please see [here.](https://bakerhughesc3.ai/oai-solution/shell-sensor-intelligence-platform/)

    Example
    --------
    ```python
    from rtdip_sdk.pipelines.transformers import SSIPPIJsonStreamToPCDMTransformer
    from rtdip_sdk.pipelines.utilities import SparkSessionUtility

    # Not required if using Databricks
    spark = SparkSessionUtility(config={}).execute()

    ssip_pi_json_stream_to_pcdm_transformer = SSIPPIJsonStreamToPCDMTransformer(
        spark=spark,
        data=df,
        source_column_name="body",
        properties_column_name="",
        metadata_delta_table=None
    )

    result = ssip_pi_json_stream_to_pcdm_transformer.transform()
    ```

    Parameters:
        spark (SparkSession): Spark Session
        data (DataFrame): DataFrame containing the path and binaryFile data
        source_column_name (str): Spark Dataframe column containing the Binary json data
        properties_column_name (str): Spark Dataframe struct typed column containing an element with the PointType
        metadata_delta_table (optional, str): Name of a metadata table that can be used for PointType mappings
    """

    spark: SparkSession
    data: DataFrame
    source_column_name: str
    properties_column_name: str
    metadata_delta_table: str

    def __init__(
        self,
        spark: SparkSession,
        data: DataFrame,
        source_column_name: str,
        properties_column_name: str,
        metadata_delta_table: str = None,
    ) -> None:
        self.spark = spark
        self.data = data
        self.source_column_name = source_column_name
        self.properties_column_name = properties_column_name
        self.metadata_delta_table = metadata_delta_table

    @staticmethod
    def system_type():
        """
        Attributes:
            SystemType (Environment): Requires PYSPARK
        """
        return SystemType.PYSPARK

    @staticmethod
    def libraries():
        libraries = Libraries()
        return libraries

    @staticmethod
    def settings() -> dict:
        return {}

    def pre_transform_validation(self):
        return True

    def post_transform_validation(self):
        return True

    def transform(self) -> DataFrame:
        """
        Returns:
            DataFrame: A dataframe with the provided Binary data converted to PCDM
        """
        df = (
            self.data.withColumn(
                self.source_column_name, col(self.source_column_name).cast("string")
            )
            .withColumn(
                "EventDate",
                get_json_object(col(self.source_column_name), "$.EventTime").cast(
                    "date"
                ),
            )
            .withColumn(
                "TagName",
                get_json_object(col(self.source_column_name), "$.TagName").cast(
                    "string"
                ),
            )
            .withColumn(
                "EventTime",
                get_json_object(col(self.source_column_name), "$.EventTime").cast(
                    "timestamp"
                ),
            )
            .withColumn(
                "Status",
                get_json_object(col(self.source_column_name), "$.Quality").cast(
                    "string"
                ),
            )
            .withColumn(
                "Value",
                get_json_object(col(self.source_column_name), "$.Value").cast("string"),
            )
            .withColumn(
                "PointType", element_at(col(self.properties_column_name), "PointType")
            )
            .withColumn(
                "Action",
                element_at(col(self.properties_column_name), "Action").cast("string"),
            )
        )

        if self.metadata_delta_table != None:
            metadata_df = SparkDeltaSource(
                self.spark, {}, self.metadata_delta_table
            ).read_batch()
            metadata_df = metadata_df.select(
                "TagName", col("PointType").alias("MetadataPointType")
            )
            df = df.join(metadata_df, (df.TagName == metadata_df.TagName), "left")
            df = df.withColumn(
                "PointType",
                (when(col("PointType").isNull(), col("MetadataPointType"))).otherwise(
                    col("PointType")
                ),
            )

        return (
            df.withColumn(
                "ValueType",
                (
                    when(col("PointType") == "Digital", "string")
                    .when(col("PointType") == "String", "string")
                    .when(col("PointType") == "Float16", "float")
                    .when(col("PointType") == "Float32", "float")
                    .when(col("PointType") == "Float64", "float")
                    .when(col("PointType") == "Int16", "integer")
                    .when(col("PointType") == "Int32", "integer")
                    .otherwise("string")
                ),
            )
            .selectExpr(
                "*",
                "CASE WHEN ValueType = 'integer' THEN try_cast(Value as integer) END as Value_Integer",
                "CASE WHEN ValueType = 'float' THEN try_cast(Value as float) END as Value_Float",
            )
            .withColumn(
                "ValueType",
                when(
                    (col("Value_Integer").isNull()) & (col("ValueType") == "integer"),
                    "string",
                )
                .when(
                    (col("Value_Float").isNull()) & (col("ValueType") == "float"),
                    "string",
                )
                .otherwise(col("ValueType")),
            )
            .withColumn(
                "ChangeType",
                (
                    when(col("Action") == "Insert", "insert")
                    .when(col("Action") == "Add", "insert")
                    .when(col("Action") == "Delete", "delete")
                    .when(col("Action") == "Update", "update")
                    .when(col("Action") == "Refresh", "update")
                ),
            )
            .select(
                col("EventDate"),
                col("TagName"),
                col("EventTime"),
                col("Status"),
                col("Value"),
                col("ValueType"),
                col("ChangeType"),
            )
        )

system_type() staticmethod

Attributes:

Name Type Description
SystemType Environment

Requires PYSPARK

Source code in src/sdk/python/rtdip_sdk/pipelines/transformers/spark/ssip_pi_binary_json_to_pcdm.py
@staticmethod
def system_type():
    """
    Attributes:
        SystemType (Environment): Requires PYSPARK
    """
    return SystemType.PYSPARK

transform()

Returns:

Name Type Description
DataFrame DataFrame

A dataframe with the provided Binary data converted to PCDM

Source code in src/sdk/python/rtdip_sdk/pipelines/transformers/spark/ssip_pi_binary_json_to_pcdm.py
def transform(self) -> DataFrame:
    """
    Returns:
        DataFrame: A dataframe with the provided Binary data converted to PCDM
    """
    df = (
        self.data.withColumn(
            self.source_column_name, col(self.source_column_name).cast("string")
        )
        .withColumn(
            "EventDate",
            get_json_object(col(self.source_column_name), "$.EventTime").cast(
                "date"
            ),
        )
        .withColumn(
            "TagName",
            get_json_object(col(self.source_column_name), "$.TagName").cast(
                "string"
            ),
        )
        .withColumn(
            "EventTime",
            get_json_object(col(self.source_column_name), "$.EventTime").cast(
                "timestamp"
            ),
        )
        .withColumn(
            "Status",
            get_json_object(col(self.source_column_name), "$.Quality").cast(
                "string"
            ),
        )
        .withColumn(
            "Value",
            get_json_object(col(self.source_column_name), "$.Value").cast("string"),
        )
        .withColumn(
            "PointType", element_at(col(self.properties_column_name), "PointType")
        )
        .withColumn(
            "Action",
            element_at(col(self.properties_column_name), "Action").cast("string"),
        )
    )

    if self.metadata_delta_table != None:
        metadata_df = SparkDeltaSource(
            self.spark, {}, self.metadata_delta_table
        ).read_batch()
        metadata_df = metadata_df.select(
            "TagName", col("PointType").alias("MetadataPointType")
        )
        df = df.join(metadata_df, (df.TagName == metadata_df.TagName), "left")
        df = df.withColumn(
            "PointType",
            (when(col("PointType").isNull(), col("MetadataPointType"))).otherwise(
                col("PointType")
            ),
        )

    return (
        df.withColumn(
            "ValueType",
            (
                when(col("PointType") == "Digital", "string")
                .when(col("PointType") == "String", "string")
                .when(col("PointType") == "Float16", "float")
                .when(col("PointType") == "Float32", "float")
                .when(col("PointType") == "Float64", "float")
                .when(col("PointType") == "Int16", "integer")
                .when(col("PointType") == "Int32", "integer")
                .otherwise("string")
            ),
        )
        .selectExpr(
            "*",
            "CASE WHEN ValueType = 'integer' THEN try_cast(Value as integer) END as Value_Integer",
            "CASE WHEN ValueType = 'float' THEN try_cast(Value as float) END as Value_Float",
        )
        .withColumn(
            "ValueType",
            when(
                (col("Value_Integer").isNull()) & (col("ValueType") == "integer"),
                "string",
            )
            .when(
                (col("Value_Float").isNull()) & (col("ValueType") == "float"),
                "string",
            )
            .otherwise(col("ValueType")),
        )
        .withColumn(
            "ChangeType",
            (
                when(col("Action") == "Insert", "insert")
                .when(col("Action") == "Add", "insert")
                .when(col("Action") == "Delete", "delete")
                .when(col("Action") == "Update", "update")
                .when(col("Action") == "Refresh", "update")
            ),
        )
        .select(
            col("EventDate"),
            col("TagName"),
            col("EventTime"),
            col("Status"),
            col("Value"),
            col("ValueType"),
            col("ChangeType"),
        )
    )
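
To make the ValueType resolution above easier to follow, here is a small, self-contained illustration of the same PointType mapping and the cast-based downgrade to string, run on toy rows rather than real SSIP PI payloads:

```python
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, when

spark = SparkSession.builder.getOrCreate()

# Toy (PointType, Value) rows - illustrative only
df = spark.createDataFrame(
    [("Float32", "1.5"), ("Int16", "42"), ("Int16", "not-a-number"), ("Digital", "ON")],
    ["PointType", "Value"],
)

df = (
    df.withColumn(
        "ValueType",
        when(col("PointType").isin("Float16", "Float32", "Float64"), "float")
        .when(col("PointType").isin("Int16", "Int32"), "integer")
        .otherwise("string"),
    )
    # Downgrade to string when the value cannot actually be cast to the declared type
    .withColumn(
        "ValueType",
        when(
            (col("ValueType") == "integer") & col("Value").cast("integer").isNull(),
            "string",
        )
        .when(
            (col("ValueType") == "float") & col("Value").cast("float").isNull(),
            "string",
        )
        .otherwise(col("ValueType")),
    )
)
df.show()
```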

AIOJsonToPCDMTransformer

Bases: TransformerInterface

Converts a Spark Dataframe column containing a json string created by AIO to the Process Control Data Model.

Example

from rtdip_sdk.pipelines.transformers import AIOJsonToPCDMTransformer

aio_json_to_pcdm_transformer = AIOJsonToPCDMTransformer(
    data=df,
    source_column_name="body",
    status_null_value="Good",
    change_type_value="insert"
)

result = aio_json_to_pcdm_transformer.transform()

Parameters:

Name Type Description Default
data DataFrame

Dataframe containing the column with Json AIO data

required
source_column_name str

Spark Dataframe column containing the Json AIO data

required
status_null_value str

If populated, will replace 'Good' in the Status column with the specified value.

'Good'
change_type_value optional str

If populated, will replace 'insert' in the ChangeType column with the specified value.

'insert'
Source code in src/sdk/python/rtdip_sdk/pipelines/transformers/spark/aio_json_to_pcdm.py
class AIOJsonToPCDMTransformer(TransformerInterface):
    """
    Converts a Spark Dataframe column containing a json string created by AIO to the Process Control Data Model.

    Example
    --------
    ```python
    from rtdip_sdk.pipelines.transformers import AIOJsonToPCDMTransformer

    aio_json_to_pcdm_transformer = AIOJsonToPCDMTransformer(
        data=df,
        source_column_name="body",
        status_null_value="Good",
        change_type_value="insert"
    )

    result = aio_json_to_pcdm_transformer.transform()
    ```

    Parameters:
        data (DataFrame): Dataframe containing the column with Json AIO data
        source_column_name (str): Spark Dataframe column containing the Json AIO data
        status_null_value (str): If populated, will replace 'Good' in the Status column with the specified value.
        change_type_value (optional str): If populated, will replace 'insert' in the ChangeType column with the specified value.
    """

    data: DataFrame
    source_column_name: str
    status_null_value: str
    change_type_value: str

    def __init__(
        self,
        data: DataFrame,
        source_column_name: str,
        status_null_value: str = "Good",
        change_type_value: str = "insert",
    ) -> None:  # NOSONAR
        self.data = data
        self.source_column_name = source_column_name
        self.status_null_value = status_null_value
        self.change_type_value = change_type_value

    @staticmethod
    def system_type():
        """
        Attributes:
            SystemType (Environment): Requires PYSPARK
        """
        return SystemType.PYSPARK

    @staticmethod
    def libraries():
        libraries = Libraries()
        return libraries

    @staticmethod
    def settings() -> dict:
        return {}

    def pre_transform_validation(self):
        return True

    def post_transform_validation(self):
        return True

    def transform(self) -> DataFrame:
        """
        Returns:
            DataFrame: A dataframe with the specified column converted to PCDM
        """
        df = (
            self.data.select(
                from_json(col(self.source_column_name), "Payload STRING").alias("body")
            )
            .select(from_json(expr("body.Payload"), AIO_SCHEMA).alias("body"))
            .select(explode("body"))
            .select(col("key").alias("TagName"), "value.*")
            .select(col("SourceTimestamp").alias("EventTime"), "TagName", "Value")
            .withColumn("Status", lit(self.status_null_value))
            .withColumn(
                "ValueType",
                when(col("Value").cast("float").isNotNull(), "float").otherwise(
                    "string"
                ),
            )
            .withColumn("ChangeType", lit(self.change_type_value))
        )

        return df.select(
            "EventTime", "TagName", "Status", "Value", "ValueType", "ChangeType"
        )

system_type() staticmethod

Attributes:

Name Type Description
SystemType Environment

Requires PYSPARK

Source code in src/sdk/python/rtdip_sdk/pipelines/transformers/spark/aio_json_to_pcdm.py
@staticmethod
def system_type():
    """
    Attributes:
        SystemType (Environment): Requires PYSPARK
    """
    return SystemType.PYSPARK

transform()

Returns:

Name Type Description
DataFrame DataFrame

A dataframe with the specified column converted to PCDM

Source code in src/sdk/python/rtdip_sdk/pipelines/transformers/spark/aio_json_to_pcdm.py
def transform(self) -> DataFrame:
    """
    Returns:
        DataFrame: A dataframe with the specified column converted to PCDM
    """
    df = (
        self.data.select(
            from_json(col(self.source_column_name), "Payload STRING").alias("body")
        )
        .select(from_json(expr("body.Payload"), AIO_SCHEMA).alias("body"))
        .select(explode("body"))
        .select(col("key").alias("TagName"), "value.*")
        .select(col("SourceTimestamp").alias("EventTime"), "TagName", "Value")
        .withColumn("Status", lit(self.status_null_value))
        .withColumn(
            "ValueType",
            when(col("Value").cast("float").isNotNull(), "float").otherwise(
                "string"
            ),
        )
        .withColumn("ChangeType", lit(self.change_type_value))
    )

    return df.select(
        "EventTime", "TagName", "Status", "Value", "ValueType", "ChangeType"
    )

OPCUAJsonToPCDMTransformer

Bases: TransformerInterface

Converts a Spark Dataframe column containing a json string created by Open Source OPC UA to the Process Control Data Model.

Example

from rtdip_sdk.pipelines.transformers import OPCUAJsonToPCDMTransformer

opcua_json_to_pcdm_transformer = OPCUAJsonToPCDMTransformer(
    data=df,
    source_column_name="body",
    status_null_value="Good",
    change_type_value="insert"
)

result = opcua_json_to_pcdm_transformer.transform()

Parameters:

Name Type Description Default
data DataFrame

Dataframe containing the column with Json OPC UA data

required
source_column_name str

Spark Dataframe column containing the OPC Publisher Json OPC UA data

required
status_null_value str

If populated, will replace 'Good' in the Status column with the specified value.

'Good'
change_type_value optional str

If populated, will replace 'insert' in the ChangeType column with the specified value.

'insert'
Source code in src/sdk/python/rtdip_sdk/pipelines/transformers/spark/opcua_json_to_pcdm.py
class OPCUAJsonToPCDMTransformer(TransformerInterface):
    """
    Converts a Spark Dataframe column containing a json string created by Open Source OPC UA to the Process Control Data Model.

    Example
    --------
    ```python
    from rtdip_sdk.pipelines.transformers import OPCUAJsonToPCDMTransformer

    opcua_json_to_pcdm_transformer = OPCUAJsonToPCDMTransformer(
        data=df,
        source_column_name="body",
        status_null_value="Good",
        change_type_value="insert"
    )

    result = opcua_json_to_pcdm_transformer.transform()
    ```

    Parameters:
        data (DataFrame): Dataframe containing the column with Json OPC UA data
        source_column_name (str): Spark Dataframe column containing the OPC Publisher Json OPC UA data
        status_null_value (str): If populated, will replace 'Good' in the Status column with the specified value.
        change_type_value (optional str): If populated, will replace 'insert' in the ChangeType column with the specified value.
    """

    data: DataFrame
    source_column_name: str
    status_null_value: str
    change_type_value: str

    def __init__(
        self,
        data: DataFrame,
        source_column_name: str,
        status_null_value: str = "Good",
        change_type_value: str = "insert",
    ) -> None:  # NOSONAR
        self.data = data
        self.source_column_name = source_column_name
        self.status_null_value = status_null_value
        self.change_type_value = change_type_value

    @staticmethod
    def system_type():
        """
        Attributes:
            SystemType (Environment): Requires PYSPARK
        """
        return SystemType.PYSPARK

    @staticmethod
    def libraries():
        libraries = Libraries()
        return libraries

    @staticmethod
    def settings() -> dict:
        return {}

    def pre_transform_validation(self):
        return True

    def post_transform_validation(self):
        return True

    def transform(self) -> DataFrame:
        """
        Returns:
            DataFrame: A dataframe with the specified column converted to PCDM
        """
        df = (
            self.data.select(
                from_json(col(self.source_column_name), "Messages STRING").alias("body")
            )
            .select(from_json(expr("body.Messages"), OPCUA_SCHEMA).alias("body"))
            .selectExpr("inline(body)")
            .select(col("Timestamp").alias("EventTime"), explode("Payload"))
            .select("EventTime", col("key").alias("TagName"), "value.*")
            .withColumn("Status", lit(self.status_null_value))
            .withColumn(
                "ValueType",
                when(col("Value").cast("float").isNotNull(), "float").otherwise(
                    "string"
                ),
            )
            .withColumn("ChangeType", lit(self.change_type_value))
        )

        return df.select(
            "EventTime", "TagName", "Status", "Value", "ValueType", "ChangeType"
        )

system_type() staticmethod

Attributes:

Name Type Description
SystemType Environment

Requires PYSPARK

Source code in src/sdk/python/rtdip_sdk/pipelines/transformers/spark/opcua_json_to_pcdm.py
@staticmethod
def system_type():
    """
    Attributes:
        SystemType (Environment): Requires PYSPARK
    """
    return SystemType.PYSPARK

transform()

Returns:

Name Type Description
DataFrame DataFrame

A dataframe with the specified column converted to PCDM

Source code in src/sdk/python/rtdip_sdk/pipelines/transformers/spark/opcua_json_to_pcdm.py
def transform(self) -> DataFrame:
    """
    Returns:
        DataFrame: A dataframe with the specified column converted to PCDM
    """
    df = (
        self.data.select(
            from_json(col(self.source_column_name), "Messages STRING").alias("body")
        )
        .select(from_json(expr("body.Messages"), OPCUA_SCHEMA).alias("body"))
        .selectExpr("inline(body)")
        .select(col("Timestamp").alias("EventTime"), explode("Payload"))
        .select("EventTime", col("key").alias("TagName"), "value.*")
        .withColumn("Status", lit(self.status_null_value))
        .withColumn(
            "ValueType",
            when(col("Value").cast("float").isNotNull(), "float").otherwise(
                "string"
            ),
        )
        .withColumn("ChangeType", lit(self.change_type_value))
    )

    return df.select(
        "EventTime", "TagName", "Status", "Value", "ValueType", "ChangeType"
    )
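
Based on the parsing steps above (a top-level Messages array whose entries carry a Timestamp and a Payload map keyed by node id, each holding a Value), the json string in the source column is expected to look roughly like the following illustrative sample; the exact set of fields inside each payload entry is an assumption, not an authoritative OPC Publisher specification:

```python
# Illustrative OPC Publisher style message (shape inferred from the transform above)
sample_body = """
{
  "Messages": [
    {
      "Timestamp": "2023-01-01T12:00:00Z",
      "Payload": {
        "ns=2;s=Temperature": {"Value": 21.7},
        "ns=2;s=PumpStatus": {"Value": "Running"}
      }
    }
  ]
}
"""
```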

CAISOToMDMTransformer

Bases: BaseRawToMDMTransformer

Converts CAISO Raw data into Meters Data Model.

Please check the BaseRawToMDMTransformer for the required arguments and methods.

Example

from rtdip_sdk.pipelines.transformers import CAISOToMDMTransformer
from rtdip_sdk.pipelines.utilities import SparkSessionUtility

# Not required if using Databricks
spark = SparkSessionUtility(config={}).execute()

caiso_to_mdm_transformer = CAISOToMDMTransformer(
    spark=spark,
    data=df,
    output_type="usage",
    name=None,
    description=None,
    value_type=None,
    version=None,
    series_id=None,
    series_parent_id=None
)

result = caiso_to_mdm_transformer.transform()
Source code in src/sdk/python/rtdip_sdk/pipelines/transformers/spark/iso/caiso_to_mdm.py
class CAISOToMDMTransformer(BaseRawToMDMTransformer):
    """
    Converts CAISO Raw data into Meters Data Model.

    Please check the BaseRawToMDMTransformer for the required arguments and methods.

    Example
    --------
    ```python
    from rtdip_sdk.pipelines.transformers import CAISOToMDMTransformer
    from rtdip_sdk.pipelines.utilities import SparkSessionUtility

    # Not required if using Databricks
    spark = SparkSessionUtility(config={}).execute()

    caiso_to_mdm_transformer = CAISOToMDMTransformer(
        spark=spark,
        data=df,
        output_type="usage",
        name=None,
        description=None,
        value_type=None,
        version=None,
        series_id=None,
        series_parent_id=None
    )

    result = caiso_to_mdm_transformer.transform()
    ```
    """

    spark: SparkSession
    data: DataFrame
    input_schema = CAISO_SCHEMA
    uid_col = "TacAreaName"
    series_id_col = "'series_std_001'"
    timestamp_col = "to_timestamp(StartTime)"
    interval_timestamp_col = "Timestamp + INTERVAL 1 HOURS"
    value_col = "Load"
    series_parent_id_col = "'series_parent_std_001'"
    name_col = "'CAISO API'"
    uom_col = "'mwh'"
    description_col = "'CAISO data pulled from CAISO ISO API'"
    timestamp_start_col = "StartTime"
    timestamp_end_col = "StartTime + INTERVAL 1 HOURS"
    time_zone_col = "'PST'"
    version_col = "'1'"
    series_type = SeriesType.Hour
    model_type = ModelType.Default
    value_type = ValueType.Usage
    properties_col = "null"

ERCOTToMDMTransformer

Bases: BaseRawToMDMTransformer

Converts ERCOT Raw data into Meters Data Model.

Please check the BaseRawToMDMTransformer for the required arguments and methods.

Example

from rtdip_sdk.pipelines.transformers import ERCOTToMDMTransformer
from rtdip_sdk.pipelines.utilities import SparkSessionUtility

# Not required if using Databricks
spark = SparkSessionUtility(config={}).execute()

ercot_to_mdm_transformer = ERCOTToMDMTransformer(
    spark=spark,
    data=df,
    output_type="usage",
    name=None,
    description=None,
    value_type=None,
    version=None,
    series_id=None,
    series_parent_id=None
)

result = ercot_to_mdm_transformer.transform()
Source code in src/sdk/python/rtdip_sdk/pipelines/transformers/spark/iso/ercot_to_mdm.py
class ERCOTToMDMTransformer(BaseRawToMDMTransformer):
    """
    Converts ERCOT Raw data into Meters Data Model.

    Please check the BaseRawToMDMTransformer for the required arguments and methods.

    Example
    --------
    ```python
    from rtdip_sdk.pipelines.transformers import ERCOTToMDMTransformer
    from rtdip_sdk.pipelines.utilities import SparkSessionUtility

    # Not required if using Databricks
    spark = SparkSessionUtility(config={}).execute()

    ercot_to_mdm_transformer = ERCOTToMDMTransformer(
        spark=spark,
        data=df,
        output_type="usage",
        name=None,
        description=None,
        value_type=None,
        version=None,
        series_id=None,
        series_parent_id=None
    )

    result = ercot_to_mdm_transformer.transform()
    ```
    """

    spark: SparkSession
    data: DataFrame
    input_schema = ERCOT_SCHEMA
    uid_col = "variable"
    series_id_col = "'series_std_001'"
    timestamp_col = "to_utc_timestamp(StartTime, 'America/Chicago')"
    interval_timestamp_col = "Timestamp + INTERVAL 1 HOURS"
    value_col = "value"
    series_parent_id_col = "'series_parent_std_001'"
    name_col = "'ERCOT API'"
    uom_col = "'mwh'"
    description_col = "'ERCOT data pulled from ERCOT ISO API'"
    timestamp_start_col = "StartTime"
    timestamp_end_col = "StartTime + INTERVAL 1 HOURS"
    time_zone_col = "'America/Chicago'"
    version_col = "'1'"
    series_type = SeriesType.Hour
    model_type = ModelType.Default
    value_type = ValueType.Usage
    properties_col = "null"

    def _pre_process(self) -> DataFrame:
        df: DataFrame = super(ERCOTToMDMTransformer, self)._pre_process()
        df = melt(
            df,
            id_vars=["Date", "HourEnding", "DstFlag"],
            value_vars=[
                "Coast",
                "East",
                "FarWest",
                "North",
                "NorthCentral",
                "SouthCentral",
                "Southern",
                "West",
                "SystemTotal",
            ],
        )
        df = df.withColumn(
            "StartTime",
            F.expr(
                "Date + MAKE_INTERVAL(0,0,0,0,cast(split(HourEnding,':')[0] as integer),0,0)"
            ),
        )
        return df
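
The only non-declarative part of this transformer is the _pre_process step above, which unpivots the regional columns and derives StartTime from the hour-ending value. Below is a small, self-contained illustration of that MAKE_INTERVAL expression on toy rows (not real ERCOT data):

```python
from pyspark.sql import SparkSession
from pyspark.sql import functions as F

spark = SparkSession.builder.getOrCreate()

# Toy hour-ending rows - illustrative only
df = spark.createDataFrame(
    [("2023-01-01", "1:00"), ("2023-01-01", "24:00")], ["Date", "HourEnding"]
).withColumn("Date", F.col("Date").cast("date"))

# Same expression as _pre_process: shift the date by the hour-ending hour count
df = df.withColumn(
    "StartTime",
    F.expr(
        "Date + MAKE_INTERVAL(0,0,0,0,cast(split(HourEnding,':')[0] as integer),0,0)"
    ),
)
df.show(truncate=False)
```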

MISOToMDMTransformer

Bases: BaseRawToMDMTransformer

Converts MISO Raw data into Meters Data Model.

Please check the BaseRawToMDMTransformer for the required arguments and methods.

Example

from rtdip_sdk.pipelines.transformers import MISOToMDMTransformer
from rtdip_sdk.pipelines.utilities import SparkSessionUtility

# Not required if using Databricks
spark = SparkSessionUtility(config={}).execute()

miso_to_mdm_transformer = MISOToMDMTransformer(
    spark=spark,
    data=df,
    output_type="usage",
    name=None,
    description=None,
    value_type=None,
    version=None,
    series_id=None,
    series_parent_id=None
)

result = miso_to_mdm_transformer.transform()
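
Because the Meters Data Model needs both a usage and a meta output, the same raw DataFrame is typically transformed twice, once per output_type. A hedged sketch, assuming df already holds raw MISO data and spark is the session created above:

```python
usage_df = MISOToMDMTransformer(spark=spark, data=df, output_type="usage").transform()
meta_df = MISOToMDMTransformer(spark=spark, data=df, output_type="meta").transform()
```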
BaseRawToMDMTransformer

BaseRawToMDMTransformer

Bases: TransformerInterface

Base class for all the Raw to Meters Data Model Transformers.

The Meters Data Model requires two outputs:
  • UsageData : Stores the measurement (value) as time series data.
  • MetaData : Stores meter-related metadata.

It supports generating both outputs, as they share some common properties.

Parameters:

Name Type Description Default
spark SparkSession

Spark Session instance.

required
data DataFrame

Dataframe containing the raw ISO data.

required
output_type str

Must be one of usage or meta.

required
name str

Set this to override default name column.

None
description str

Set this to override default description column.

None
value_type ValueType

Set this to override default value_type column.

None
version str

Set this to override default version column.

None
series_id str

Set this to override default series_id column.

None
series_parent_id str

Set this to override default series_parent_id column.

None
Source code in src/sdk/python/rtdip_sdk/pipelines/transformers/spark/base_raw_to_mdm.py
class BaseRawToMDMTransformer(TransformerInterface):
    """
    Base class for all the Raw to Meters Data Model Transformers.

    Meters Data Model requires two outputs:
        - `UsageData` : To store measurement(value) as timeseries data.
        - `MetaData` : To store meters related meta information.

    It supports the generation of both the outputs as they share some common properties.

    Parameters:
        spark (SparkSession): Spark Session instance.
        data (DataFrame): Dataframe containing the raw MISO data.
        output_type (str): Must be one of `usage` or `meta`.
        name (str): Set this to override default `name` column.
        description (str): Set this to override default `description` column.
        value_type (ValueType): Set this to override default `value_type` column.
        version (str): Set this to override default `version` column.
        series_id (str): Set this to override default `series_id` column.
        series_parent_id (str): Set this to override default `series_parent_id` column.
    """

    spark: SparkSession
    data: DataFrame
    output_type: str
    input_schema: StructType
    target_schema: StructType
    uid_col: str
    series_id_col: str
    timestamp_col: str
    interval_timestamp_col: str
    value_col: str
    series_parent_id_col: str
    name_col: str
    uom_col: str
    description_col: str
    timestamp_start_col: str
    timestamp_end_col: str
    time_zone_col: str
    version_col: str
    series_type: SeriesType
    model_type: ModelType
    value_type: ValueType
    properties_col: str

    def __init__(
        self,
        spark: SparkSession,
        data: DataFrame,
        output_type: str,
        name: str = None,
        description: str = None,
        value_type: ValueType = None,
        version: str = None,
        series_id: str = None,
        series_parent_id: str = None,
    ):
        self.spark = spark
        self.data = data
        self.output_type = output_type
        self.name = name if name is not None else self.name_col
        self.description = (
            description if description is not None else self.description_col
        )
        self.value_type = value_type if value_type is not None else self.value_type
        self.version = version if version is not None else self.version_col
        self.series_id = series_id if series_id is not None else self.series_id_col
        self.series_parent_id = (
            series_parent_id
            if series_parent_id is not None
            else self.series_parent_id_col
        )

    @staticmethod
    def system_type():
        return SystemType.PYSPARK

    @staticmethod
    def libraries():
        libraries = Libraries()
        return libraries

    @staticmethod
    def settings() -> dict:
        return {}

    def pre_transform_validation(self) -> bool:
        valid_output_types = ["usage", "meta"]
        if self.output_type not in valid_output_types:
            raise ValueError(
                f"Invalid output_type `{self.output_type}` given. Must be one of {valid_output_types}"
            )

        assert str(self.data.schema) == str(self.input_schema)
        assert type(self.series_type).__name__ == SeriesType.__name__
        assert type(self.model_type).__name__ == ModelType.__name__
        assert type(self.value_type).__name__ == ValueType.__name__
        return True

    def post_transform_validation(self) -> bool:
        assert str(self.data.schema) == str(self.target_schema)
        return True

    def _get_transformed_df(self) -> DataFrame:
        if self.output_type == "usage":
            self.target_schema = MDM_USAGE_SCHEMA
            return self._get_usage_transformed_df()
        else:
            self.target_schema = MDM_META_SCHEMA
            return self._get_meta_transformed_df()

    def _convert_into_target_schema(self) -> None:
        """
        Converts a Spark DataFrame structure into new structure based on the Target Schema.

        Returns: Nothing.

        """

        df: DataFrame = self.data
        df = df.select(self.target_schema.names)

        for field in self.target_schema.fields:
            df = df.withColumn(field.name, col(field.name).cast(field.dataType))

        self.data = self.spark.createDataFrame(df.rdd, self.target_schema)

    def transform(self) -> DataFrame:
        """
        Returns:
            DataFrame: A dataframe with the raw data converted into MDM.
        """

        self.pre_transform_validation()
        self.data = self._get_transformed_df()
        self._convert_into_target_schema()
        self.post_transform_validation()

        return self.data

    def _add_uid_column(self, df: DataFrame) -> DataFrame:
        return df.withColumn("Uid", expr(self.uid_col))

    def _add_series_id_column(self, df: DataFrame) -> DataFrame:
        return df.withColumn("SeriesId", expr(self.series_id))

    def _add_timestamp_column(self, df: DataFrame) -> DataFrame:
        return df.withColumn("Timestamp", expr(self.timestamp_col))

    def _add_interval_timestamp_column(self, df: DataFrame) -> DataFrame:
        return df.withColumn("IntervalTimestamp", expr(self.interval_timestamp_col))

    def _add_value_column(self, df: DataFrame) -> DataFrame:
        return df.withColumn("Value", expr(self.value_col))

    def _add_series_parent_id_column(self, df: DataFrame) -> DataFrame:
        return df.withColumn("SeriesParentId", expr(self.series_parent_id))

    def _add_name_column(self, df: DataFrame) -> DataFrame:
        return df.withColumn("Name", expr(self.name))

    def _add_uom_column(self, df: DataFrame) -> DataFrame:
        return df.withColumn("Uom", expr(self.uom_col))

    def _add_description_column(self, df: DataFrame) -> DataFrame:
        return df.withColumn("Description", expr(self.description))

    def _add_timestamp_start_column(self, df: DataFrame) -> DataFrame:
        return df.withColumn("TimestampStart", expr(self.timestamp_start_col))

    def _add_timestamp_end_column(self, df: DataFrame) -> DataFrame:
        return df.withColumn("TimestampEnd", expr(self.timestamp_end_col))

    def _add_time_zone_column(self, df: DataFrame) -> DataFrame:
        return df.withColumn("Timezone", expr(self.time_zone_col))

    def _add_version_column(self, df: DataFrame) -> DataFrame:
        return df.withColumn("Version", expr(self.version))

    def _add_series_type_column(self, df: DataFrame) -> DataFrame:
        return df.withColumn("SeriesType", lit(self.series_type.value))

    def _add_model_type_column(self, df: DataFrame) -> DataFrame:
        return df.withColumn("ModelType", lit(self.model_type.value))

    def _add_value_type_column(self, df: DataFrame) -> DataFrame:
        return df.withColumn("ValueType", lit(self.value_type.value))

    def _add_properties_column(self, df: DataFrame) -> DataFrame:
        return df.withColumn("Properties", expr(self.properties_col))

    def _pre_process(self) -> DataFrame:
        return self.data

    @staticmethod
    def _post_process(df: DataFrame) -> DataFrame:
        return df

    def _get_usage_transformed_df(self) -> DataFrame:
        df = self._pre_process()

        df = self._add_uid_column(df)
        df = self._add_series_id_column(df)
        df = self._add_timestamp_column(df)
        df = self._add_interval_timestamp_column(df)
        df = self._add_value_column(df)

        df = self._post_process(df)

        return df

    def _get_meta_transformed_df(self) -> DataFrame:
        df = self._pre_process()

        df = self._add_uid_column(df)
        df = self._add_series_id_column(df)
        df = self._add_series_parent_id_column(df)
        df = self._add_name_column(df)
        df = self._add_uom_column(df)
        df = self._add_description_column(df)
        df = self._add_timestamp_start_column(df)
        df = self._add_timestamp_end_column(df)
        df = self._add_time_zone_column(df)
        df = self._add_version_column(df)
        df = self._add_series_type_column(df)
        df = self._add_model_type_column(df)
        df = self._add_value_type_column(df)
        df = self._add_properties_column(df)

        df = self._post_process(df)

        return df

transform()

Returns:

Name Type Description
DataFrame DataFrame

A dataframe with the raw data converted into MDM.

Source code in src/sdk/python/rtdip_sdk/pipelines/transformers/spark/base_raw_to_mdm.py
def transform(self) -> DataFrame:
    """
    Returns:
        DataFrame: A dataframe with the raw data converted into MDM.
    """

    self.pre_transform_validation()
    self.data = self._get_transformed_df()
    self._convert_into_target_schema()
    self.post_transform_validation()

    return self.data
Source code in src/sdk/python/rtdip_sdk/pipelines/transformers/spark/iso/miso_to_mdm.py
class MISOToMDMTransformer(BaseRawToMDMTransformer):
    """
    Converts MISO Raw data into Meters Data Model.

    Please check the BaseRawToMDMTransformer for the required arguments and methods.

    Example
    --------
    ```python
    from rtdip_sdk.pipelines.transformers import MISOToMDMTransformer
    from rtdip_sdk.pipelines.utilities import SparkSessionUtility

    # Not required if using Databricks
    spark = SparkSessionUtility(config={}).execute()

    miso_to_mdm_transformer = MISOToMDMTransformer(
        spark=spark,
        data=df,
        output_type="usage",
        name=None,
        description=None,
        value_type=None,
        version=None,
        series_id=None,
        series_parent_id=None
    )

    result = miso_to_mdm_transformer.transform()
    ```

    BaseRawToMDMTransformer:
        ::: src.sdk.python.rtdip_sdk.pipelines.transformers.spark.base_raw_to_mdm
    """

    spark: SparkSession
    data: DataFrame
    input_schema = MISO_SCHEMA
    uid_col = "variable"
    series_id_col = "'series_std_001'"
    timestamp_col = "to_utc_timestamp(Datetime, 'US/Central')"
    interval_timestamp_col = "Timestamp + INTERVAL 1 HOURS"
    value_col = "bround(value, 2)"
    series_parent_id_col = "'series_parent_std_001'"
    name_col = "'Miso API'"
    uom_col = "'mwh'"
    description_col = "'Miso data pulled from Miso ISO API'"
    timestamp_start_col = "Datetime"
    timestamp_end_col = "Datetime + INTERVAL 1 HOURS"
    time_zone_col = "'US/Central'"
    version_col = "'1'"
    series_type = SeriesType.Hour
    model_type = ModelType.Default
    value_type = ValueType.Usage
    properties_col = "null"

    def _pre_process(self) -> DataFrame:
        df: DataFrame = super(MISOToMDMTransformer, self)._pre_process()
        df = melt(
            df,
            id_vars=["Datetime"],
            value_vars=[
                "Lrz1",
                "Lrz2_7",
                "Lrz3_5",
                "Lrz4",
                "Lrz6",
                "Lrz8_9_10",
                "Miso",
            ],
        )
        return df

PJMToMDMTransformer

Bases: BaseRawToMDMTransformer

Converts PJM Raw data into Meters Data Model.

Please check the BaseRawToMDMTransformer for the required arguments and methods.

Example

from rtdip_sdk.pipelines.transformers import PJMToMDMTransformer
from rtdip_sdk.pipelines.utilities import SparkSessionUtility

# Not required if using Databricks
spark = SparkSessionUtility(config={}).execute()

pjm_to_mdm_transformer = PJMToMDMTransformer(
    spark=spark,
    data=df,
    output_type="usage",
    name=None,
    description=None,
    value_type=None,
    version=None,
    series_id=None,
    series_parent_id=None
)

result = pjm_to_mdm_transformer.transform()
BaseRawToMDMTransformer

BaseRawToMDMTransformer

Bases: TransformerInterface

Base class for all the Raw to Meters Data Model Transformers.

The Meters Data Model requires two outputs:
  • UsageData : Stores the measurement (value) as time series data.
  • MetaData : Stores meter-related metadata.

It supports generating both outputs, as they share some common properties.

Parameters:

Name Type Description Default
spark SparkSession

Spark Session instance.

required
data DataFrame

Dataframe containing the raw ISO data.

required
output_type str

Must be one of usage or meta.

required
name str

Set this to override default name column.

None
description str

Set this to override default description column.

None
value_type ValueType

Set this to override default value_type column.

None
version str

Set this to override default version column.

None
series_id str

Set this to override default series_id column.

None
series_parent_id str

Set this to override default series_parent_id column.

None
Source code in src/sdk/python/rtdip_sdk/pipelines/transformers/spark/base_raw_to_mdm.py
class BaseRawToMDMTransformer(TransformerInterface):
    """
    Base class for all the Raw to Meters Data Model Transformers.

    Meters Data Model requires two outputs:
        - `UsageData` : To store measurement(value) as timeseries data.
        - `MetaData` : To store meters related meta information.

    It supports the generation of both the outputs as they share some common properties.

    Parameters:
        spark (SparkSession): Spark Session instance.
        data (DataFrame): Dataframe containing the raw ISO data.
        output_type (str): Must be one of `usage` or `meta`.
        name (str): Set this to override default `name` column.
        description (str): Set this to override default `description` column.
        value_type (ValueType): Set this to override default `value_type` column.
        version (str): Set this to override default `version` column.
        series_id (str): Set this to override default `series_id` column.
        series_parent_id (str): Set this to override default `series_parent_id` column.
    """

    spark: SparkSession
    data: DataFrame
    output_type: str
    input_schema: StructType
    target_schema: StructType
    uid_col: str
    series_id_col: str
    timestamp_col: str
    interval_timestamp_col: str
    value_col: str
    series_parent_id_col: str
    name_col: str
    uom_col: str
    description_col: str
    timestamp_start_col: str
    timestamp_end_col: str
    time_zone_col: str
    version_col: str
    series_type: SeriesType
    model_type: ModelType
    value_type: ValueType
    properties_col: str

    def __init__(
        self,
        spark: SparkSession,
        data: DataFrame,
        output_type: str,
        name: str = None,
        description: str = None,
        value_type: ValueType = None,
        version: str = None,
        series_id: str = None,
        series_parent_id: str = None,
    ):
        self.spark = spark
        self.data = data
        self.output_type = output_type
        self.name = name if name is not None else self.name_col
        self.description = (
            description if description is not None else self.description_col
        )
        self.value_type = value_type if value_type is not None else self.value_type
        self.version = version if version is not None else self.version_col
        self.series_id = series_id if series_id is not None else self.series_id_col
        self.series_parent_id = (
            series_parent_id
            if series_parent_id is not None
            else self.series_parent_id_col
        )

    @staticmethod
    def system_type():
        return SystemType.PYSPARK

    @staticmethod
    def libraries():
        libraries = Libraries()
        return libraries

    @staticmethod
    def settings() -> dict:
        return {}

    def pre_transform_validation(self) -> bool:
        valid_output_types = ["usage", "meta"]
        if self.output_type not in valid_output_types:
            raise ValueError(
                f"Invalid output_type `{self.output_type}` given. Must be one of {valid_output_types}"
            )

        assert str(self.data.schema) == str(self.input_schema)
        assert type(self.series_type).__name__ == SeriesType.__name__
        assert type(self.model_type).__name__ == ModelType.__name__
        assert type(self.value_type).__name__ == ValueType.__name__
        return True

    def post_transform_validation(self) -> bool:
        assert str(self.data.schema) == str(self.target_schema)
        return True

    def _get_transformed_df(self) -> DataFrame:
        if self.output_type == "usage":
            self.target_schema = MDM_USAGE_SCHEMA
            return self._get_usage_transformed_df()
        else:
            self.target_schema = MDM_META_SCHEMA
            return self._get_meta_transformed_df()

    def _convert_into_target_schema(self) -> None:
        """
        Converts a Spark DataFrame structure into new structure based on the Target Schema.

        Returns: Nothing.

        """

        df: DataFrame = self.data
        df = df.select(self.target_schema.names)

        for field in self.target_schema.fields:
            df = df.withColumn(field.name, col(field.name).cast(field.dataType))

        self.data = self.spark.createDataFrame(df.rdd, self.target_schema)

    def transform(self) -> DataFrame:
        """
        Returns:
            DataFrame: A dataframe with the raw data converted into MDM.
        """

        self.pre_transform_validation()
        self.data = self._get_transformed_df()
        self._convert_into_target_schema()
        self.post_transform_validation()

        return self.data

    def _add_uid_column(self, df: DataFrame) -> DataFrame:
        return df.withColumn("Uid", expr(self.uid_col))

    def _add_series_id_column(self, df: DataFrame) -> DataFrame:
        return df.withColumn("SeriesId", expr(self.series_id))

    def _add_timestamp_column(self, df: DataFrame) -> DataFrame:
        return df.withColumn("Timestamp", expr(self.timestamp_col))

    def _add_interval_timestamp_column(self, df: DataFrame) -> DataFrame:
        return df.withColumn("IntervalTimestamp", expr(self.interval_timestamp_col))

    def _add_value_column(self, df: DataFrame) -> DataFrame:
        return df.withColumn("Value", expr(self.value_col))

    def _add_series_parent_id_column(self, df: DataFrame) -> DataFrame:
        return df.withColumn("SeriesParentId", expr(self.series_parent_id))

    def _add_name_column(self, df: DataFrame) -> DataFrame:
        return df.withColumn("Name", expr(self.name))

    def _add_uom_column(self, df: DataFrame) -> DataFrame:
        return df.withColumn("Uom", expr(self.uom_col))

    def _add_description_column(self, df: DataFrame) -> DataFrame:
        return df.withColumn("Description", expr(self.description))

    def _add_timestamp_start_column(self, df: DataFrame) -> DataFrame:
        return df.withColumn("TimestampStart", expr(self.timestamp_start_col))

    def _add_timestamp_end_column(self, df: DataFrame) -> DataFrame:
        return df.withColumn("TimestampEnd", expr(self.timestamp_end_col))

    def _add_time_zone_column(self, df: DataFrame) -> DataFrame:
        return df.withColumn("Timezone", expr(self.time_zone_col))

    def _add_version_column(self, df: DataFrame) -> DataFrame:
        return df.withColumn("Version", expr(self.version))

    def _add_series_type_column(self, df: DataFrame) -> DataFrame:
        return df.withColumn("SeriesType", lit(self.series_type.value))

    def _add_model_type_column(self, df: DataFrame) -> DataFrame:
        return df.withColumn("ModelType", lit(self.model_type.value))

    def _add_value_type_column(self, df: DataFrame) -> DataFrame:
        return df.withColumn("ValueType", lit(self.value_type.value))

    def _add_properties_column(self, df: DataFrame) -> DataFrame:
        return df.withColumn("Properties", expr(self.properties_col))

    def _pre_process(self) -> DataFrame:
        return self.data

    @staticmethod
    def _post_process(df: DataFrame) -> DataFrame:
        return df

    def _get_usage_transformed_df(self) -> DataFrame:
        df = self._pre_process()

        df = self._add_uid_column(df)
        df = self._add_series_id_column(df)
        df = self._add_timestamp_column(df)
        df = self._add_interval_timestamp_column(df)
        df = self._add_value_column(df)

        df = self._post_process(df)

        return df

    def _get_meta_transformed_df(self) -> DataFrame:
        df = self._pre_process()

        df = self._add_uid_column(df)
        df = self._add_series_id_column(df)
        df = self._add_series_parent_id_column(df)
        df = self._add_name_column(df)
        df = self._add_uom_column(df)
        df = self._add_description_column(df)
        df = self._add_timestamp_start_column(df)
        df = self._add_timestamp_end_column(df)
        df = self._add_time_zone_column(df)
        df = self._add_version_column(df)
        df = self._add_series_type_column(df)
        df = self._add_model_type_column(df)
        df = self._add_value_type_column(df)
        df = self._add_properties_column(df)

        df = self._post_process(df)

        return df

transform()

Returns:
  • DataFrame: A dataframe with the raw data converted into MDM.

Source code in src/sdk/python/rtdip_sdk/pipelines/transformers/spark/base_raw_to_mdm.py
def transform(self) -> DataFrame:
    """
    Returns:
        DataFrame: A dataframe with the raw data converted into MDM.
    """

    self.pre_transform_validation()
    self.data = self._get_transformed_df()
    self._convert_into_target_schema()
    self.post_transform_validation()

    return self.data
Source code in src/sdk/python/rtdip_sdk/pipelines/transformers/spark/iso/pjm_to_mdm.py
class PJMToMDMTransformer(BaseRawToMDMTransformer):
    """
    Converts PJM Raw data into Meters Data Model.

    Please check the BaseRawToMDMTransformer for the required arguments and methods.

    Example
    --------
    ```python
    from rtdip_sdk.pipelines.transformers import PJMToMDMTransformer
    from rtdip_sdk.pipelines.utilities import SparkSessionUtility

    # Not required if using Databricks
    spark = SparkSessionUtility(config={}).execute()

    pjm_to_mdm_transformer = PJMToMDMTransformer(
        spark=spark,
        data=df,
        output_type="usage",
        name=None,
        description=None,
        value_type=None,
        version=None,
        series_id=None,
        series_parent_id=None
    )

    result = pjm_to_mdm_transformer.transform()
    ```

    BaseRawToMDMTransformer:
        ::: src.sdk.python.rtdip_sdk.pipelines.transformers.spark.base_raw_to_mdm
    """

    spark: SparkSession
    data: DataFrame
    input_schema = PJM_SCHEMA
    uid_col = "Zone"
    series_id_col = "'series_std_001'"
    timestamp_col = "to_utc_timestamp(StartTime, 'America/New_York')"
    interval_timestamp_col = "Timestamp + INTERVAL 1 HOURS"
    value_col = "bround(Load, 2)"
    series_parent_id_col = "'series_parent_std_001'"
    name_col = "'PJM API'"
    uom_col = "'mwh'"
    description_col = "'PJM data pulled from PJM ISO API'"
    timestamp_start_col = "StartTime"
    timestamp_end_col = "StartTime + INTERVAL 1 HOURS"
    time_zone_col = "'America/New_York'"
    version_col = "'1'"
    series_type = SeriesType.Hour
    model_type = ModelType.Default
    value_type = ValueType.Usage
    properties_col = "null"

EdgeXOPCUAJsonToPCDMTransformer

Bases: TransformerInterface

Converts a Spark Dataframe column containing a json string created by EdgeX to the Process Control Data Model.

Example

from rtdip_sdk.pipelines.transformers import EdgeXOPCUAJsonToPCDMTransformer

edge_opcua_json_to_pcdm_transformer = EdgeXOPCUAJsonToPCDMTransformer(
    data=df,
    source_column_name="body",
    status_null_value="Good",
    change_type_value="insert"
)

result = edge_opcua_json_to_pcdm_transformer.transform()

Parameters:
  • data (DataFrame): Dataframe containing the column with EdgeX data. Required.
  • source_column_name (str): Spark Dataframe column containing the EdgeX OPC UA Json data. Required.
  • status_null_value (optional str): If populated, will replace 'Good' in the Status column with the specified value. Default: 'Good'.
  • change_type_value (optional str): If populated, will replace 'insert' in the ChangeType column with the specified value. Default: 'insert'.
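
For illustration, a minimal payload and call might look as follows. The JSON layout is an assumption based only on the fields the transformer reads (`readings`, `resourceName`, `origin`, `value`, `valueType`), and `spark` is an existing Spark Session as in the other examples on this page:

```python
from rtdip_sdk.pipelines.transformers import EdgeXOPCUAJsonToPCDMTransformer

# Hypothetical EdgeX-style event with a single OPC UA reading.
edgex_body = (
    '{"readings": [{"resourceName": "Pressure", "origin": 1700000000000000000, '
    '"value": "1.3", "valueType": "Float64"}]}'
)
df = spark.createDataFrame([(edgex_body,)], ["body"])

result = EdgeXOPCUAJsonToPCDMTransformer(
    data=df,
    source_column_name="body",
).transform()
# result columns: TagName, EventTime, Status, Value, ValueType, ChangeType
```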
Source code in src/sdk/python/rtdip_sdk/pipelines/transformers/spark/edgex_opcua_json_to_pcdm.py
class EdgeXOPCUAJsonToPCDMTransformer(TransformerInterface):
    """
    Converts a Spark Dataframe column containing a json string created by EdgeX to the Process Control Data Model.

    Example
    --------
    ```python
    from rtdip_sdk.pipelines.transformers import EdgeXOPCUAJsonToPCDMTransformer

    edge_opcua_json_to_pcdm_transformer = EdgeXOPCUAJsonToPCDMTransformer(
        data=df,
        source_column_name="body",
        status_null_value="Good",
        change_type_value="insert"
    )

    result = edge_opcua_json_to_pcdm_transformer.transform()
    ```

    Parameters:
        data (DataFrame): Dataframe containing the column with EdgeX data
        source_column_name (str): Spark Dataframe column containing the EdgeX OPC UA Json data
        status_null_value (optional str): If populated, will replace 'Good' in the Status column with the specified value.
        change_type_value (optional str): If populated, will replace 'insert' in the ChangeType column with the specified value.
    """

    data: DataFrame
    source_column_name: str
    status_null_value: str
    change_type_value: str
    tagname_field: str

    def __init__(
        self,
        data: DataFrame,
        source_column_name: str,
        status_null_value: str = "Good",
        change_type_value: str = "insert",
        tagname_field="resourceName",
    ) -> None:
        self.data = data
        self.source_column_name = source_column_name
        self.status_null_value = status_null_value
        self.change_type_value = change_type_value
        self.tagname_field = tagname_field

    @staticmethod
    def system_type():
        """
        Attributes:
            SystemType (Environment): Requires PYSPARK
        """
        return SystemType.PYSPARK

    @staticmethod
    def libraries():
        libraries = Libraries()
        return libraries

    @staticmethod
    def settings() -> dict:
        return {}

    def pre_transform_validation(self):
        return True

    def post_transform_validation(self):
        return True

    def transform(self) -> DataFrame:
        """
        Returns:
            DataFrame: A dataframe with the specified column converted to PCDM
        """
        df = (
            self.data.withColumn(
                self.source_column_name,
                from_json(self.source_column_name, EDGEX_SCHEMA),
            )
            .select("*", explode("{}.readings".format(self.source_column_name)))
            .selectExpr(
                "explode({}.readings.{}) as TagName".format(
                    self.source_column_name, self.tagname_field
                ),
                "to_utc_timestamp(to_timestamp((col.origin / 1000000000)), current_timezone()) as EventTime",
                "col.value as Value",
                "col.valueType as ValueType",
            )
            .withColumn("Status", lit(self.status_null_value))
            .withColumn("ChangeType", lit(self.change_type_value))
            .withColumn(
                "ValueType",
                (
                    when(col("ValueType") == "Int8", "integer")
                    .when(col("ValueType") == "Int16", "integer")
                    .when(col("ValueType") == "Int32", "integer")
                    .when(col("ValueType") == "Int64", "integer")
                    .when(col("ValueType") == "Uint8", "integer")
                    .when(col("ValueType") == "Uint16", "integer")
                    .when(col("ValueType") == "Uint32", "integer")
                    .when(col("ValueType") == "Uint64", "integer")
                    .when(col("ValueType") == "Float32", "float")
                    .when(col("ValueType") == "Float64", "float")
                    .when(col("ValueType") == "Bool", "bool")
                    .otherwise("string")
                ),
            )
        )

        return df.select(
            "TagName", "EventTime", "Status", "Value", "ValueType", "ChangeType"
        )

system_type() staticmethod

Attributes:
  • SystemType (Environment): Requires PYSPARK

Source code in src/sdk/python/rtdip_sdk/pipelines/transformers/spark/edgex_opcua_json_to_pcdm.py
@staticmethod
def system_type():
    """
    Attributes:
        SystemType (Environment): Requires PYSPARK
    """
    return SystemType.PYSPARK

transform()

Returns:
  • DataFrame: A dataframe with the specified column converted to PCDM.

Source code in src/sdk/python/rtdip_sdk/pipelines/transformers/spark/edgex_opcua_json_to_pcdm.py
def transform(self) -> DataFrame:
    """
    Returns:
        DataFrame: A dataframe with the specified column converted to PCDM
    """
    df = (
        self.data.withColumn(
            self.source_column_name,
            from_json(self.source_column_name, EDGEX_SCHEMA),
        )
        .select("*", explode("{}.readings".format(self.source_column_name)))
        .selectExpr(
            "explode({}.readings.{}) as TagName".format(
                self.source_column_name, self.tagname_field
            ),
            "to_utc_timestamp(to_timestamp((col.origin / 1000000000)), current_timezone()) as EventTime",
            "col.value as Value",
            "col.valueType as ValueType",
        )
        .withColumn("Status", lit(self.status_null_value))
        .withColumn("ChangeType", lit(self.change_type_value))
        .withColumn(
            "ValueType",
            (
                when(col("ValueType") == "Int8", "integer")
                .when(col("ValueType") == "Int16", "integer")
                .when(col("ValueType") == "Int32", "integer")
                .when(col("ValueType") == "Int64", "integer")
                .when(col("ValueType") == "Uint8", "integer")
                .when(col("ValueType") == "Uint16", "integer")
                .when(col("ValueType") == "Uint32", "integer")
                .when(col("ValueType") == "Uint64", "integer")
                .when(col("ValueType") == "Float32", "float")
                .when(col("ValueType") == "Float64", "float")
                .when(col("ValueType") == "Bool", "bool")
                .otherwise("string")
            ),
        )
    )

    return df.select(
        "TagName", "EventTime", "Status", "Value", "ValueType", "ChangeType"
    )

ECMWFExtractBaseToWeatherDataModel

Bases: TransformerInterface

Base class for extracting forecast data downloaded in .nc format from ECMWF MARS Server.

Parameters:
  • load_path (str): Path to local directory where the nc files will be stored, in format "yyyy-mm-dd_HH.nc". Required.
  • date_start (str): Start date of extraction in "YYYY-MM-DD HH:MM:SS" format. Required.
  • date_end (str): End date of extraction in "YYYY-MM-DD HH:MM:SS" format. Required.
  • run_frequency (str): Frequency format of runs to download, e.g. "H". Required.
  • run_interval (str): Interval of runs, e.g. a run_frequency of "H" and run_interval of "12" will extract the data of the 00 and 12 run for each day. Required.
  • lat (DataArray): Latitude values to extract from nc files. Required.
  • lon (DataArray): Longitude values to extract from nc files. Required.
  • utc (bool): Whether to convert the time to UTC or not. Default: True.
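
As a concrete illustration of `run_frequency` and `run_interval`, the class builds its list of forecast runs with `pd.date_range(start, end, freq=run_interval + run_frequency)`, so `"12"` + `"H"` selects the 00 and 12 runs of each day, and each run maps to an nc file named "yyyy-mm-dd_HH.nc":

```python
import pandas as pd

# freq="12H" -> one entry per 00 and 12 run between the start and end dates.
runs = pd.date_range(
    start="2021-01-01 00:00:00", end="2021-01-02 12:00:00", freq="12H"
)
# -> DatetimeIndex(['2021-01-01 00:00:00', '2021-01-01 12:00:00',
#                   '2021-01-02 00:00:00', '2021-01-02 12:00:00'], freq='12H')
# e.g. the second run would be read from the file "2021-01-01_12.nc".
```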
Source code in src/sdk/python/rtdip_sdk/pipelines/transformers/spark/ecmwf/nc_extractbase_to_weather_data_model.py
class ECMWFExtractBaseToWeatherDataModel(TransformerInterface):
    """
    Base class for extracting forecast data downloaded in .nc format from ECMWF MARS Server.

    Args:
        load_path (str): Path to local directory where the nc files will be stored, in format "yyyy-mm-dd_HH.nc"
        date_start (str): Start date of extraction in "YYYY-MM-DD HH:MM:SS" format
        date_end (str): End date of extraction in "YYYY-MM-DD HH:MM:SS" format
        run_frequency (str): Frequency format of runs to download, e.g. "H"
        run_interval (str): Interval of runs, e.g. a run_frequency of "H" and run_interval of "12" will extract the data of the 00 and 12 run for each day.
        lat (DataArray): Latitude values to extract from nc files
        lon (DataArray): Longitude values to extract from nc files
        utc (bool = True): Whether to convert the time to UTC or not
    """

    def __init__(
        self,
        load_path: str,
        date_start: str,
        date_end: str,
        run_interval: str,
        run_frequency: str,
        lat: xr.DataArray,
        lon: xr.DataArray,
        utc: bool = True,
    ):
        self.load_path = load_path
        self.lat = lat
        self.lon = lon
        self.date_start = date_start
        self.date_end = date_end
        self.run_frequency = run_frequency
        self.run_interval = run_interval
        self.utc = utc
        self.dates = pd.date_range(
            start=self.date_start,
            end=self.date_end,
            freq=self.run_interval + self.run_frequency,
        )

    @staticmethod
    def system_type():
        """
        Attributes:
            SystemType (Environment): Requires PYSPARK
        """
        return SystemType.PYSPARK

    @staticmethod
    def libraries():
        libraries = Libraries()
        return libraries

    @staticmethod
    def settings() -> dict:
        return {}

    def pre_transform_validation(self):
        return True

    def post_transform_validation(self):
        return True

    @staticmethod
    def _convert_ws_tag_names(x: list):
        """
        Converts the tag names of wind speed from the format used in the nc files to the format used in the weather data model.

        Args:
            x (list): List of variable names of raw tags to be extracted from the nc files

        Returns:
            new_tags(list): List of variable names of raw tags to be extracted from the nc files, converted to the format used in the weather data model.
        """
        convert_dict = {
            "10u": "u10",
            "100u": "u100",
            "200u": "u200",
            "10v": "v10",
            "100v": "v100",
            "200v": "v200",
        }
        new_tags = [convert_dict[i] if i in convert_dict.keys() else i for i in x]
        return new_tags

    def transform(
        self, tag_prefix: str, variables: list, method: str = "nearest"
    ) -> pd.DataFrame:
        """Extract raw data from stored nc files downloaded via ECMWF MARS.

        Args:
            tag_prefix (str): Prefix of the tag names of raw tags to be added to the dataframe
            variables (list): List of variable names of raw tags to be extracted from the nc files
            method (str, optional): The method used to match latitude/longitude in xarray using .sel(), by default "nearest"

        Returns:
            df (pd.DataFrame): Raw data extracted with lat, lon, run_time, target_time as a pd.multiindex and variables as columns.
        """
        df = []
        # e.g. 10u variable is saved as u10 in the file...
        vars_processed = self._convert_ws_tag_names(variables)

        for i in self.dates:
            filename = f"{str(i.date())}_{i.hour:02}.nc"
            fullpath = os.path.join(self.load_path, filename)
            ds = xr.open_dataset(fullpath)
            tmp = (
                ds[vars_processed]
                .sel(latitude=self.lat, longitude=self.lon, method=method)
                .to_dataframe()
            )
            tmp["run_time"] = i
            df.append(tmp)
            ds.close()

        df = pd.concat(df, axis=0)

        df = df.rename_axis(
            index={
                "time": "target_time",
                "latitude": "lat",
                "longitude": "lon",
            }
        )

        df = df.reset_index(["lat", "lon"])
        df[["lat", "lon"]] = df[["lat", "lon"]].apply(
            lambda x: np.round(x.astype(float), 5)
        )

        if "level" in df.index.names:
            index_names = ["lat", "lon", "level", "run_time", "target_time"]
        else:
            index_names = ["lat", "lon", "run_time", "target_time"]
        df = df.reset_index().set_index(index_names)

        if self.utc:
            df = df.tz_localize("UTC", level="target_time")
            df = df.tz_localize("UTC", level="run_time")

        df = df[~(df.index.duplicated(keep="first"))]
        df = df.sort_index(axis=0)
        df = df.sort_index(axis=1)

        df_new = df.reset_index()

        df_new = df_new.rename(
            columns={
                "lat": "Latitude",
                "lon": "Longitude",
                "run_time": "EnqueuedTime",
                "target_time": "EventTime",
            }
        )

        df_new = (
            df_new.set_index(["Latitude", "Longitude", "EnqueuedTime", "EventTime"])[
                vars_processed
            ]
            .rename_axis("Measure", axis=1)
            .stack()
            .reset_index(name="Value")
        )

        df_new["Source"] = "ECMWF_MARS"
        df_new["Status"] = "Good"
        df_new["Latest"] = True
        df_new["EventDate"] = pd.to_datetime(df_new["EventTime"]).dt.date
        df_new["TagName"] = (
            tag_prefix
            + df_new["Latitude"].astype(str)
            + "_"
            + df_new["Longitude"].astype(str)
            + "_"
            + df_new["Source"]
            + "_"
            + df_new["Measure"]
        )
        df_final = df_new.drop("Measure", axis=1)

        return df_final

system_type() staticmethod

Attributes:
  • SystemType (Environment): Requires PYSPARK

Source code in src/sdk/python/rtdip_sdk/pipelines/transformers/spark/ecmwf/nc_extractbase_to_weather_data_model.py
@staticmethod
def system_type():
    """
    Attributes:
        SystemType (Environment): Requires PYSPARK
    """
    return SystemType.PYSPARK

transform(tag_prefix, variables, method='nearest')

Extract raw data from stored nc files downloaded via ECMWF MARS.

Parameters:
  • tag_prefix (str): Prefix of the tag names of raw tags to be added to the dataframe. Required.
  • variables (list): List of variable names of raw tags to be extracted from the nc files. Required.
  • method (str, optional): The method used to match latitude/longitude in xarray using .sel(). Default: "nearest".

Returns:
  • df (pd.DataFrame): Raw data extracted from the nc files, reshaped into weather data model rows with Latitude, Longitude, EnqueuedTime, EventTime, TagName and Value columns.

Source code in src/sdk/python/rtdip_sdk/pipelines/transformers/spark/ecmwf/nc_extractbase_to_weather_data_model.py
def transform(
    self, tag_prefix: str, variables: list, method: str = "nearest"
) -> pd.DataFrame:
    """Extract raw data from stored nc files downloaded via ECMWF MARS.

    Args:
        tag_prefix (str): Prefix of the tag names of raw tags to be added to the dataframe
        variables (list): List of variable names of raw tags to be extracted from the nc files
        method (str, optional): The method used to match latitude/longitude in xarray using .sel(), by default "nearest"

    Returns:
        df (pd.DataFrame): Raw data extracted with lat, lon, run_time, target_time as a pd.multiindex and variables as columns.
    """
    df = []
    # e.g. 10u variable is saved as u10 in the file...
    vars_processed = self._convert_ws_tag_names(variables)

    for i in self.dates:
        filename = f"{str(i.date())}_{i.hour:02}.nc"
        fullpath = os.path.join(self.load_path, filename)
        ds = xr.open_dataset(fullpath)
        tmp = (
            ds[vars_processed]
            .sel(latitude=self.lat, longitude=self.lon, method=method)
            .to_dataframe()
        )
        tmp["run_time"] = i
        df.append(tmp)
        ds.close()

    df = pd.concat(df, axis=0)

    df = df.rename_axis(
        index={
            "time": "target_time",
            "latitude": "lat",
            "longitude": "lon",
        }
    )

    df = df.reset_index(["lat", "lon"])
    df[["lat", "lon"]] = df[["lat", "lon"]].apply(
        lambda x: np.round(x.astype(float), 5)
    )

    if "level" in df.index.names:
        index_names = ["lat", "lon", "level", "run_time", "target_time"]
    else:
        index_names = ["lat", "lon", "run_time", "target_time"]
    df = df.reset_index().set_index(index_names)

    if self.utc:
        df = df.tz_localize("UTC", level="target_time")
        df = df.tz_localize("UTC", level="run_time")

    df = df[~(df.index.duplicated(keep="first"))]
    df = df.sort_index(axis=0)
    df = df.sort_index(axis=1)

    df_new = df.reset_index()

    df_new = df_new.rename(
        columns={
            "lat": "Latitude",
            "lon": "Longitude",
            "run_time": "EnqueuedTime",
            "target_time": "EventTime",
        }
    )

    df_new = (
        df_new.set_index(["Latitude", "Longitude", "EnqueuedTime", "EventTime"])[
            vars_processed
        ]
        .rename_axis("Measure", axis=1)
        .stack()
        .reset_index(name="Value")
    )

    df_new["Source"] = "ECMWF_MARS"
    df_new["Status"] = "Good"
    df_new["Latest"] = True
    df_new["EventDate"] = pd.to_datetime(df_new["EventTime"]).dt.date
    df_new["TagName"] = (
        tag_prefix
        + df_new["Latitude"].astype(str)
        + "_"
        + df_new["Longitude"].astype(str)
        + "_"
        + df_new["Source"]
        + "_"
        + df_new["Measure"]
    )
    df_final = df_new.drop("Measure", axis=1)

    return df_final

ECMWFExtractGridToWeatherDataModel

Bases: ECMWFExtractBaseToWeatherDataModel

Extract a grid from a local .nc file downloaded from ECMWF via MARS

Parameters:
  • lat_min (float): Minimum latitude of grid to extract. Required.
  • lat_max (float): Maximum latitude of grid to extract. Required.
  • lon_min (float): Minimum longitude of grid to extract. Required.
  • lon_max (float): Maximum longitude of grid to extract. Required.
  • grid_step (float): The grid length to use to define the grid, e.g. 0.1. Required.
  • load_path (str): Path to local directory with nc files downloaded in format "yyyy-mm-dd_HH.nc". Required.
  • date_start (str): Start date of extraction in "YYYY-MM-DD HH:MM:SS" format. Required.
  • date_end (str): End date of extraction in "YYYY-MM-DD HH:MM:SS" format. Required.
  • run_frequency (str): Frequency format of runs to download, e.g. "H". Required.
  • run_interval (str): Interval of runs, e.g. a run_frequency of "H" and run_interval of "12" will extract the data of the 00 and 12 run for each day. Required.
  • utc (bool, optional): Add utc to the datetime indexes. Default: True.
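
A minimal usage sketch, assuming twice-daily runs and nc files already downloaded to `load_path`; the import path and the variable names passed to `transform` are illustrative assumptions:

```python
from rtdip_sdk.pipelines.transformers import ECMWFExtractGridToWeatherDataModel

grid_transformer = ECMWFExtractGridToWeatherDataModel(
    lat_min=51.0,
    lat_max=52.0,
    lon_min=-1.0,
    lon_max=0.0,
    grid_step=0.1,
    load_path="{PATH-TO-NC-FILES}",
    date_start="2021-01-01 00:00:00",
    date_end="2021-01-02 00:00:00",
    run_interval="12",
    run_frequency="H",
)

df = grid_transformer.transform(tag_prefix="{TAG-PREFIX}", variables=["10u", "10v"])
```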
Source code in src/sdk/python/rtdip_sdk/pipelines/transformers/spark/ecmwf/nc_extractgrid_to_weather_data_model.py
class ECMWFExtractGridToWeatherDataModel(ECMWFExtractBaseToWeatherDataModel):
    """Extract a grid from a local .nc file downloaded from ECMWF via MARS

    Args:
        lat_min (float): Minimum latitude of grid to extract
        lat_max (float): Maximum latitude of grid to extract
        lon_min (float): Minimum longitude of grid to extract
        lon_max (float): Maximum longitude of grid to extract
        grid_step (float): The grid length to use to define the grid, e.g. 0.1.
        load_path (str): Path to local directory with nc files downloaded in format "yyyy-mm-dd_HH.nc"
        date_start (str): Start date of extraction in "YYYY-MM-DD HH:MM:SS" format
        date_end (str): End date of extraction in "YYYY-MM-DD HH:MM:SS" format
        run_frequency (str): Frequency format of runs to download, e.g. "H"
        run_interval (str): Interval of runs, e.g. a run_frequency of "H" and run_interval of "12" will extract the data of the 00 and 12 run for each day.
        utc (bool, optional): Add utc to the datetime indexes? Defaults to True.

    """

    def __init__(
        self,
        lat_min: float,
        lat_max: float,
        lon_min: float,
        lon_max: float,
        grid_step: float,
        load_path: str,
        date_start: str,
        date_end: str,
        run_interval: str,
        run_frequency: str,
        utc: bool = True,
    ):
        # hmm careful with floating points, this seems to work ok...
        lat_xr = xr.DataArray(
            np.linspace(
                lat_min, lat_max, int(np.round((lat_max - lat_min) / grid_step)) + 1
            ),
            dims=["latitude"],
        )
        lon_xr = xr.DataArray(
            np.linspace(
                lon_min, lon_max, int(np.round((lon_max - lon_min) / grid_step)) + 1
            ),
            dims=["longitude"],
        )

        self.load_path = load_path
        self.lat_min = lat_min
        self.lat_max = lat_max
        self.lon_min = lon_min
        self.lon_max = lon_max
        self.grid_step = grid_step
        self.lat = lat_xr
        self.lon = lon_xr
        self.date_start = date_start
        self.date_end = date_end
        self.run_frequency = run_frequency
        self.run_interval = run_interval
        self.utc = utc

        super(ECMWFExtractGridToWeatherDataModel, self).__init__(
            lat=lat_xr,
            lon=lon_xr,
            load_path=load_path,
            date_start=date_start,
            date_end=date_end,
            run_interval=run_interval,
            run_frequency=run_frequency,
            utc=utc,
        )

ECMWFExtractPointToWeatherDataModel

Bases: ECMWFExtractBaseToWeatherDataModel

Extract a single point from a local .nc file downloaded from ECMWF via MARS

Parameters:
  • lat (float): Latitude of point to extract. Required.
  • lon (float): Longitude of point to extract. Required.
  • load_path (str): Path to local directory with nc files downloaded in format "yyyy-mm-dd_HH.nc". Required.
  • date_start (str): Start date of extraction in "YYYY-MM-DD HH:MM:SS" format. Required.
  • date_end (str): End date of extraction in "YYYY-MM-DD HH:MM:SS" format. Required.
  • run_frequency (str): Frequency format of runs to download, e.g. "H". Required.
  • run_interval (str): Interval of runs, e.g. a run_frequency of "H" and run_interval of "12" will extract the data of the 00 and 12 run for each day. Required.
  • utc (bool, optional): Add utc to the datetime indexes. Default: True.
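
The point variant follows the same pattern; a minimal sketch under the same assumptions as the grid example above:

```python
from rtdip_sdk.pipelines.transformers import ECMWFExtractPointToWeatherDataModel

point_transformer = ECMWFExtractPointToWeatherDataModel(
    lat=51.5,
    lon=-0.1,
    load_path="{PATH-TO-NC-FILES}",
    date_start="2021-01-01 00:00:00",
    date_end="2021-01-02 00:00:00",
    run_interval="12",
    run_frequency="H",
)

df = point_transformer.transform(tag_prefix="{TAG-PREFIX}", variables=["10u", "10v"])
```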
Source code in src/sdk/python/rtdip_sdk/pipelines/transformers/spark/ecmwf/nc_extractpoint_to_weather_data_model.py
class ECMWFExtractPointToWeatherDataModel(ECMWFExtractBaseToWeatherDataModel):
    """
    Extract a single point from a local .nc file downloaded from ECMWF via MARS

    Args:
        lat (float): Latitude of point to extract
        lon (float): Longitude of point to extract
        load_path (str): Path to local directory with nc files downloaded in format "yyyy-mm-dd_HH.nc"
        date_start (str): Start date of extraction in "YYYY-MM-DD HH:MM:SS" format
        date_end (str): End date of extraction in "YYYY-MM-DD HH:MM:SS" format
        run_frequency (str): Frequency format of runs to download, e.g. "H"
        run_interval (str): Interval of runs, e.g. a run_frequency of "H" and run_interval of "12" will extract the data of the 00 and 12 run for each day.
        utc (bool, optional): Add utc to the datetime indexes? Defaults to True.
    """

    def __init__(
        self,
        lat: float,
        lon: float,
        load_path: str,
        date_start: str,
        date_end: str,
        run_interval: str,
        run_frequency: str,
        utc: bool = True,
    ):
        lat_xr = xr.DataArray([lat], dims=["latitude"])
        lon_xr = xr.DataArray([lon], dims=["longitude"])

        self.lat = lat_xr
        self.lon = lon_xr
        self.load_path = load_path
        self.date_start = date_start
        self.date_end = date_end
        self.run_frequency = run_frequency
        self.run_interval = run_interval
        self.utc = utc

        super(ECMWFExtractPointToWeatherDataModel, self).__init__(
            lat=lat_xr,
            lon=lon_xr,
            load_path=load_path,
            date_start=date_start,
            date_end=date_end,
            run_interval=run_interval,
            run_frequency=run_frequency,
            utc=utc,
        )

RawForecastToWeatherDataModel

Bases: TransformerInterface

Converts a raw forecast into weather data model.

Parameters:
  • spark (SparkSession): Spark Session instance. Required.
  • data (DataFrame): Dataframe to be transformed. Required.
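
A minimal usage sketch, mirroring the other transformer examples on this page (the import path is assumed to follow the same pattern):

```python
from rtdip_sdk.pipelines.transformers import RawForecastToWeatherDataModel
from rtdip_sdk.pipelines.utilities import SparkSessionUtility

# Not required if using Databricks
spark = SparkSessionUtility(config={}).execute()

raw_forecast_to_weather_data_model = RawForecastToWeatherDataModel(
    spark=spark,
    data=df,
)

result = raw_forecast_to_weather_data_model.transform()
```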
Source code in src/sdk/python/rtdip_sdk/pipelines/transformers/spark/the_weather_company/raw_forecast_to_weather_data_model.py
class RawForecastToWeatherDataModel(TransformerInterface):
    """
    Converts a raw forecast into weather data model.

    Parameters:
        spark (SparkSession): Spark Session instance.
        data (DataFrame): Dataframe to be transformed
    """

    spark: SparkSession
    data: DataFrame

    def __init__(
        self,
        spark: SparkSession,
        data: DataFrame,
    ) -> None:
        self.spark = spark
        self.data = data
        self.target_schema = WEATHER_DATA_MODEL

    @staticmethod
    def system_type():
        """
        Attributes:
            SystemType (Environment): Requires PYSPARK
        """
        return SystemType.PYSPARK

    @staticmethod
    def libraries():
        libraries = Libraries()
        return libraries

    @staticmethod
    def settings() -> dict:
        return {}

    def pre_transform_validation(self):
        return True

    def post_transform_validation(self) -> bool:
        assert str(self.data.schema) == str(self.target_schema)
        return True

    def _convert_into_target_schema(self) -> None:
        """
        Converts a Spark DataFrame structure into new structure based on the Target Schema.

        Returns: Nothing.

        """

        df: DataFrame = self.data
        df = df.select(self.target_schema.names)

        for field in self.target_schema.fields:
            df = df.withColumn(field.name, col(field.name).cast(field.dataType))

        self.data = self.spark.createDataFrame(df.rdd, self.target_schema)

    def transform(self) -> DataFrame:
        """
        Returns:
            DataFrame: A Forecast dataframe converted into Weather Data Model
        """

        self.pre_transform_validation()

        processed_date = datetime.now(timezone.utc).strftime("%Y-%m-%d %H:%M:%S")

        df = (
            self.data.withColumn("WeatherDay", substring("FcstValidLocal", 0, 10))
            .withColumn(
                "WeatherHour",
                (substring("FcstValidLocal", 12, 2).cast(IntegerType()) + 1),
            )
            .withColumn("WeatherTimezoneOffset", substring("FcstValidLocal", 20, 5))
            .withColumn("WeatherType", lit("F"))
            .withColumn("ProcessedDate", lit(processed_date))
            .withColumnRenamed("Temp", "Temperature")
            .withColumnRenamed("Dewpt", "DewPoint")
            .withColumnRenamed("Rh", "Humidity")
            .withColumnRenamed("Hi", "HeatIndex")
            .withColumnRenamed("Wc", "WindChill")
            .withColumnRenamed("Wdir", "WindDirection")
            .withColumnRenamed("Wspd", "WindSpeed")
            .withColumnRenamed("Clds", "CloudCover")
            .withColumn("WetBulbTemp", lit(""))
            .withColumn("SolarIrradiance", lit(""))
            .withColumnRenamed("Qpf", "Precipitation")
            .withColumnRenamed("DayInd", "DayOrNight")
            .withColumnRenamed("Dow", "DayOfWeek")
            .withColumnRenamed("Gust", "WindGust")
            .withColumnRenamed("Mslp", "MslPressure")
            .withColumnRenamed("Num", "ForecastDayNum")
            .withColumnRenamed("Pop", "PropOfPrecip")
            .withColumnRenamed("PrecipType", "PrecipType")
            .withColumnRenamed("SnowQpf", "SnowAccumulation")
            .withColumnRenamed("UvIndex", "UvIndex")
            .withColumnRenamed("Vis", "Visibility")
        )

        columns = df.columns
        for column in columns:
            df = df.withColumn(
                column, when(col(column) == "", lit(None)).otherwise(col(column))
            )

        self.data = df
        self._convert_into_target_schema()
        self.post_transform_validation()

        return self.data

system_type() staticmethod

Attributes:
  • SystemType (Environment): Requires PYSPARK

Source code in src/sdk/python/rtdip_sdk/pipelines/transformers/spark/the_weather_company/raw_forecast_to_weather_data_model.py
@staticmethod
def system_type():
    """
    Attributes:
        SystemType (Environment): Requires PYSPARK
    """
    return SystemType.PYSPARK

transform()

Returns:
  • DataFrame: A Forecast dataframe converted into Weather Data Model.

Source code in src/sdk/python/rtdip_sdk/pipelines/transformers/spark/the_weather_company/raw_forecast_to_weather_data_model.py
def transform(self) -> DataFrame:
    """
    Returns:
        DataFrame: A Forecast dataframe converted into Weather Data Model
    """

    self.pre_transform_validation()

    processed_date = datetime.now(timezone.utc).strftime("%Y-%m-%d %H:%M:%S")

    df = (
        self.data.withColumn("WeatherDay", substring("FcstValidLocal", 0, 10))
        .withColumn(
            "WeatherHour",
            (substring("FcstValidLocal", 12, 2).cast(IntegerType()) + 1),
        )
        .withColumn("WeatherTimezoneOffset", substring("FcstValidLocal", 20, 5))
        .withColumn("WeatherType", lit("F"))
        .withColumn("ProcessedDate", lit(processed_date))
        .withColumnRenamed("Temp", "Temperature")
        .withColumnRenamed("Dewpt", "DewPoint")
        .withColumnRenamed("Rh", "Humidity")
        .withColumnRenamed("Hi", "HeatIndex")
        .withColumnRenamed("Wc", "WindChill")
        .withColumnRenamed("Wdir", "WindDirection")
        .withColumnRenamed("Wspd", "WindSpeed")
        .withColumnRenamed("Clds", "CloudCover")
        .withColumn("WetBulbTemp", lit(""))
        .withColumn("SolarIrradiance", lit(""))
        .withColumnRenamed("Qpf", "Precipitation")
        .withColumnRenamed("DayInd", "DayOrNight")
        .withColumnRenamed("Dow", "DayOfWeek")
        .withColumnRenamed("Gust", "WindGust")
        .withColumnRenamed("Mslp", "MslPressure")
        .withColumnRenamed("Num", "ForecastDayNum")
        .withColumnRenamed("Pop", "PropOfPrecip")
        .withColumnRenamed("PrecipType", "PrecipType")
        .withColumnRenamed("SnowQpf", "SnowAccumulation")
        .withColumnRenamed("UvIndex", "UvIndex")
        .withColumnRenamed("Vis", "Visibility")
    )

    columns = df.columns
    for column in columns:
        df = df.withColumn(
            column, when(col(column) == "", lit(None)).otherwise(col(column))
        )

    self.data = df
    self._convert_into_target_schema()
    self.post_transform_validation()

    return self.data

PCDMToHoneywellAPMTransformer

Bases: TransformerInterface

Converts a Spark Dataframe in PCDM format to Honeywell APM format.

Example

from rtdip_sdk.pipelines.transformers import PCDMToHoneywellAPMTransformer

pcdm_to_honeywell_apm_transformer = PCDMToHoneywellAPMTransformer(
    data=df,
    quality="Good",
    history_samples_per_message=1,
    compress_payload=True
)

result = pcdm_to_honeywell_apm_transformer.transform()

Parameters:
  • data (DataFrame): Spark Dataframe in PCDM format. Required.
  • quality (str): Value for quality inside HistorySamples. Default: 'Good'.
  • history_samples_per_message (int): The number of HistorySamples for each row in the DataFrame (Batch Only). Default: 1.
  • compress_payload (bool): If True, compresses CloudPlatformEvent with gzip compression. Default: True.
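
When `compress_payload` is True, the returned `CloudPlatformEvent` column holds a base64-encoded, gzip-compressed JSON string. A consumer can recover the original event as sketched below; this simply reverses the transformer's internal compression step:

```python
import base64
import gzip
import json

def decode_cloud_platform_event(payload: str) -> dict:
    # base64 decode, gzip decompress, then parse the JSON event.
    return json.loads(gzip.decompress(base64.b64decode(payload)).decode("utf-8"))
```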
Source code in src/sdk/python/rtdip_sdk/pipelines/transformers/spark/pcdm_to_honeywell_apm.py
class PCDMToHoneywellAPMTransformer(TransformerInterface):
    """
    Converts a Spark Dataframe in PCDM format to Honeywell APM format.

    Example
    --------
    ```python
    from rtdip_sdk.pipelines.transformers import PCDMToHoneywellAPMTransformer

    pcdm_to_honeywell_apm_transformer = PCDMToHoneywellAPMTransformer(
        data=df,
        quality="Good",
        history_samples_per_message=1,
        compress_payload=True
    )

    result = pcdm_to_honeywell_apm_transformer.transform()
    ```

    Parameters:
        data (Dataframe): Spark Dataframe in PCDM format
        quality (str): Value for quality inside HistorySamples
        history_samples_per_message (int): The number of HistorySamples for each row in the DataFrame (Batch Only)
        compress_payload (bool): If True compresses CloudPlatformEvent with gzip compression
    """

    data: DataFrame
    quality: str
    history_samples_per_message: int
    compress_payload: bool

    def __init__(
        self,
        data: DataFrame,
        quality: str = "Good",
        history_samples_per_message: int = 1,
        compress_payload: bool = True,
    ) -> None:
        self.data = data
        self.quality = quality
        self.history_samples_per_message = history_samples_per_message
        self.compress_payload = compress_payload

    @staticmethod
    def system_type():
        """
        Attributes:
            SystemType (Environment): Requires PYSPARK
        """
        return SystemType.PYSPARK

    @staticmethod
    def libraries():
        libraries = Libraries()
        return libraries

    @staticmethod
    def settings() -> dict:
        return {}

    def pre_transform_validation(self):
        return True

    def post_transform_validation(self):
        return True

    def transform(self) -> DataFrame:
        """
        Returns:
            DataFrame: A dataframe with rows in Honeywell APM format
        """

        @udf("string")
        def _compress_payload(data):
            compressed_data = gzip.compress(data.encode("utf-8"))
            encoded_data = base64.b64encode(compressed_data).decode("utf-8")
            return encoded_data

        if self.data.isStreaming == False and self.history_samples_per_message > 1:
            w = Window.partitionBy("TagName").orderBy("TagName")
            cleaned_pcdm_df = (
                self.data.withColumn(
                    "index",
                    floor(
                        (row_number().over(w) - 0.01) / self.history_samples_per_message
                    ),
                )
                .withColumn(
                    "HistorySamples",
                    struct(
                        col("TagName").alias("ItemName"),
                        lit(self.quality).alias("Quality"),
                        col("EventTime").alias("Time"),
                        col("Value").alias("Value"),
                    ).alias("HistorySamples"),
                )
                .groupBy("TagName", "index")
                .agg(collect_list("HistorySamples").alias("HistorySamples"))
                .withColumn("guid", sha2(col("TagName"), 256).cast("string"))
                .withColumn(
                    "value",
                    struct(
                        col("guid").alias("SystemGuid"), col("HistorySamples")
                    ).alias("value"),
                )
            )
        else:
            cleaned_pcdm_df = self.data.withColumn(
                "guid", sha2(col("TagName"), 256).cast("string")
            ).withColumn(
                "value",
                struct(
                    col("guid").alias("SystemGuid"),
                    array(
                        struct(
                            col("TagName").alias("ItemName"),
                            lit(self.quality).alias("Quality"),
                            col("EventTime").alias("Time"),
                            col("Value").alias("Value"),
                        ),
                    ).alias("HistorySamples"),
                ),
            )

        df = (
            cleaned_pcdm_df.withColumn(
                "CloudPlatformEvent",
                struct(
                    lit(datetime.now(tz=pytz.UTC)).alias("CreatedTime"),
                    lit(expr("uuid()")).alias("Id"),
                    col("guid").alias("CreatorId"),
                    lit("CloudPlatformSystem").alias("CreatorType"),
                    lit(None).alias("GeneratorId"),
                    lit("CloudPlatformTenant").alias("GeneratorType"),
                    col("guid").alias("TargetId"),
                    lit("CloudPlatformTenant").alias("TargetType"),
                    lit(None).alias("TargetContext"),
                    struct(
                        lit("TextualBody").alias("type"),
                        to_json(col("value")).alias("value"),
                        lit("application/json").alias("format"),
                    ).alias("Body"),
                    array(
                        struct(
                            lit("SystemType").alias("Key"),
                            lit("apm-system").alias("Value"),
                        ),
                        struct(
                            lit("SystemGuid").alias("Key"), col("guid").alias("Value")
                        ),
                    ).alias("BodyProperties"),
                    lit("DataChange.Update").alias("EventType"),
                ),
            )
            .withColumn("AnnotationStreamIds", lit(","))
            .withColumn("partitionKey", col("guid"))
        )
        if self.compress_payload:
            return df.select(
                _compress_payload(to_json("CloudPlatformEvent")).alias(
                    "CloudPlatformEvent"
                ),
                "AnnotationStreamIds",
                "partitionKey",
            )
        else:
            return df.select(
                "CloudPlatformEvent", "AnnotationStreamIds", "partitionKey"
            )

system_type() staticmethod

Attributes:
  • SystemType (Environment): Requires PYSPARK

Source code in src/sdk/python/rtdip_sdk/pipelines/transformers/spark/pcdm_to_honeywell_apm.py
@staticmethod
def system_type():
    """
    Attributes:
        SystemType (Environment): Requires PYSPARK
    """
    return SystemType.PYSPARK

transform()

Returns:
  • DataFrame: A dataframe with rows in Honeywell APM format.

Source code in src/sdk/python/rtdip_sdk/pipelines/transformers/spark/pcdm_to_honeywell_apm.py
def transform(self) -> DataFrame:
    """
    Returns:
        DataFrame: A dataframe with rows in Honeywell APM format
    """

    @udf("string")
    def _compress_payload(data):
        compressed_data = gzip.compress(data.encode("utf-8"))
        encoded_data = base64.b64encode(compressed_data).decode("utf-8")
        return encoded_data

    if self.data.isStreaming == False and self.history_samples_per_message > 1:
        w = Window.partitionBy("TagName").orderBy("TagName")
        cleaned_pcdm_df = (
            self.data.withColumn(
                "index",
                floor(
                    (row_number().over(w) - 0.01) / self.history_samples_per_message
                ),
            )
            .withColumn(
                "HistorySamples",
                struct(
                    col("TagName").alias("ItemName"),
                    lit(self.quality).alias("Quality"),
                    col("EventTime").alias("Time"),
                    col("Value").alias("Value"),
                ).alias("HistorySamples"),
            )
            .groupBy("TagName", "index")
            .agg(collect_list("HistorySamples").alias("HistorySamples"))
            .withColumn("guid", sha2(col("TagName"), 256).cast("string"))
            .withColumn(
                "value",
                struct(
                    col("guid").alias("SystemGuid"), col("HistorySamples")
                ).alias("value"),
            )
        )
    else:
        cleaned_pcdm_df = self.data.withColumn(
            "guid", sha2(col("TagName"), 256).cast("string")
        ).withColumn(
            "value",
            struct(
                col("guid").alias("SystemGuid"),
                array(
                    struct(
                        col("TagName").alias("ItemName"),
                        lit(self.quality).alias("Quality"),
                        col("EventTime").alias("Time"),
                        col("Value").alias("Value"),
                    ),
                ).alias("HistorySamples"),
            ),
        )

    df = (
        cleaned_pcdm_df.withColumn(
            "CloudPlatformEvent",
            struct(
                lit(datetime.now(tz=pytz.UTC)).alias("CreatedTime"),
                lit(expr("uuid()")).alias("Id"),
                col("guid").alias("CreatorId"),
                lit("CloudPlatformSystem").alias("CreatorType"),
                lit(None).alias("GeneratorId"),
                lit("CloudPlatformTenant").alias("GeneratorType"),
                col("guid").alias("TargetId"),
                lit("CloudPlatformTenant").alias("TargetType"),
                lit(None).alias("TargetContext"),
                struct(
                    lit("TextualBody").alias("type"),
                    to_json(col("value")).alias("value"),
                    lit("application/json").alias("format"),
                ).alias("Body"),
                array(
                    struct(
                        lit("SystemType").alias("Key"),
                        lit("apm-system").alias("Value"),
                    ),
                    struct(
                        lit("SystemGuid").alias("Key"), col("guid").alias("Value")
                    ),
                ).alias("BodyProperties"),
                lit("DataChange.Update").alias("EventType"),
            ),
        )
        .withColumn("AnnotationStreamIds", lit(","))
        .withColumn("partitionKey", col("guid"))
    )
    if self.compress_payload:
        return df.select(
            _compress_payload(to_json("CloudPlatformEvent")).alias(
                "CloudPlatformEvent"
            ),
            "AnnotationStreamIds",
            "partitionKey",
        )
    else:
        return df.select(
            "CloudPlatformEvent", "AnnotationStreamIds", "partitionKey"
        )
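
When compress_payload is enabled, the CloudPlatformEvent column returned above contains a gzip-compressed, base64-encoded JSON string. A minimal sketch (plain Python, not part of the SDK) of how a downstream consumer might reverse that encoding is shown below:

```python
import base64
import gzip
import json


def decode_cloud_platform_event(encoded: str) -> dict:
    """Reverse of the _compress_payload UDF above: base64-decode, gunzip, parse JSON."""
    compressed = base64.b64decode(encoded)
    payload = gzip.decompress(compressed).decode("utf-8")
    return json.loads(payload)
```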

HoneywellAPMJsonToPCDMTransformer

Bases: TransformerInterface

Converts a Spark Dataframe column containing a json string created by Honeywell APM to the Process Control Data Model.

Example

from rtdip_sdk.pipelines.transformers import HoneywellAPMJsonToPCDMTransformer

honeywell_apm_json_to_pcdm_transformer = HoneywellAPMJsonToPCDMTransformer(
    data=df,
    source_column_name="body",
    status_null_value="Good",
    change_type_value="insert"
)

result = honeywell_apm_json_to_pcdm_transformer.transform()

Parameters:

Name Type Description Default
data DataFrame

Dataframe containing the column with Honeywell APM data

required
source_column_name str

Spark Dataframe column containing the Json Honeywell APM data

required
status_null_value optional str

If populated, will replace 'Good' in the Status column with the specified value.

'Good'
change_type_value optional str

If populated, will replace 'insert' in the ChangeType column with the specified value.

'insert'
Source code in src/sdk/python/rtdip_sdk/pipelines/transformers/spark/honeywell_apm_to_pcdm.py
class HoneywellAPMJsonToPCDMTransformer(TransformerInterface):
    """
    Converts a Spark Dataframe column containing a json string created by Honeywell APM to the Process Control Data Model.

    Example
    --------
    ```python
    from rtdip_sdk.pipelines.transformers import HoneywellAPMJsonToPCDMTransformer

    honeywell_apm_json_to_pcdm_transformer = HoneywellAPMJsonToPCDMTransformer(
        data=df,
        source_column_name="body",
        status_null_value="Good",
        change_type_value="insert"
    )

    result = honeywell_apm_json_to_pcdm_transformer.transform()
    ```

    Parameters:
        data (DataFrame): Dataframe containing the column with Honeywell APM data
        source_column_name (str): Spark Dataframe column containing the Json Honeywell APM data
        status_null_value (optional str): If populated, will replace 'Good' in the Status column with the specified value.
        change_type_value (optional str): If populated, will replace 'insert' in the ChangeType column with the specified value.
    """

    data: DataFrame
    source_column_name: str
    status_null_value: str
    change_type_value: str

    def __init__(
        self,
        data: DataFrame,
        source_column_name: str,
        status_null_value: str = "Good",
        change_type_value: str = "insert",
    ) -> None:
        self.data = data
        self.source_column_name = source_column_name
        self.status_null_value = status_null_value
        self.change_type_value = change_type_value

    @staticmethod
    def system_type():
        """
        Attributes:
            SystemType (Environment): Requires PYSPARK
        """
        return SystemType.PYSPARK

    @staticmethod
    def libraries():
        libraries = Libraries()
        return libraries

    @staticmethod
    def settings() -> dict:
        return {}

    def pre_transform_validation(self):
        return True

    def post_transform_validation(self):
        return True

    def transform(self) -> DataFrame:
        """
        Returns:
            DataFrame: A dataframe with the specified column converted to PCDM
        """
        df = (
            self.data.withColumn("body", from_json(self.source_column_name, APM_SCHEMA))
            .select(explode("body.SystemTimeSeries.Samples"))
            .selectExpr("*", "to_timestamp(col.Time) as EventTime")
            .withColumn("TagName", col("col.Itemname"))
            .withColumn("Status", lit(self.status_null_value))
            .withColumn("Value", col("col.Value"))
            .withColumn(
                "ValueType",
                when(col("value").cast("float").isNotNull(), "float").when(
                    col("value").cast("float").isNull(), "string"
                ),
            )
            .withColumn("ChangeType", lit(self.change_type_value))
        )

        return df.select(
            "TagName", "EventTime", "Status", "Value", "ValueType", "ChangeType"
        )

system_type() staticmethod

Attributes:

Name Type Description
SystemType Environment

Requires PYSPARK

Source code in src/sdk/python/rtdip_sdk/pipelines/transformers/spark/honeywell_apm_to_pcdm.py
@staticmethod
def system_type():
    """
    Attributes:
        SystemType (Environment): Requires PYSPARK
    """
    return SystemType.PYSPARK

transform()

Returns:

Name Type Description
DataFrame DataFrame

A dataframe with the specified column converted to PCDM

Source code in src/sdk/python/rtdip_sdk/pipelines/transformers/spark/honeywell_apm_to_pcdm.py
def transform(self) -> DataFrame:
    """
    Returns:
        DataFrame: A dataframe with the specified column converted to PCDM
    """
    df = (
        self.data.withColumn("body", from_json(self.source_column_name, APM_SCHEMA))
        .select(explode("body.SystemTimeSeries.Samples"))
        .selectExpr("*", "to_timestamp(col.Time) as EventTime")
        .withColumn("TagName", col("col.Itemname"))
        .withColumn("Status", lit(self.status_null_value))
        .withColumn("Value", col("col.Value"))
        .withColumn(
            "ValueType",
            when(col("value").cast("float").isNotNull(), "float").when(
                col("value").cast("float").isNull(), "string"
            ),
        )
        .withColumn("ChangeType", lit(self.change_type_value))
    )

    return df.select(
        "TagName", "EventTime", "Status", "Value", "ValueType", "ChangeType"
    )
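
The transform above expects each row of the source column to hold a JSON document shaped like SystemTimeSeries.Samples with item name, time and value fields. A small, self-contained sketch of applying the transformer to a hand-built DataFrame follows; the exact field names in the sample payload are assumptions inferred from the transform logic and must match APM_SCHEMA in practice.

```python
import json

from rtdip_sdk.pipelines.transformers import HoneywellAPMJsonToPCDMTransformer
from rtdip_sdk.pipelines.utilities import SparkSessionUtility

# Not required if using Databricks
spark = SparkSessionUtility(config={}).execute()

# Illustrative payload only - the field names are assumed and must match APM_SCHEMA
sample = {
    "SystemTimeSeries": {
        "Samples": [
            {"ItemName": "TAG-001", "Time": "2024-01-01T00:00:00Z", "Value": "1.5"}
        ]
    }
}
df = spark.createDataFrame([(json.dumps(sample),)], ["body"])

result = HoneywellAPMJsonToPCDMTransformer(
    data=df,
    source_column_name="body",
).transform()
```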

SEMJsonToPCDMTransformer

Bases: TransformerInterface

Converts a Spark Dataframe column containing a json string created by SEM to the Process Control Data Model.

Example

from rtdip_sdk.pipelines.transformers import SEMJsonToPCDMTransformer

sem_json_to_pcdm_transformer = SEMJsonToPCDMTransformer(
    data=df,
    source_column_name="body",
    version=10,
    status_null_value="Good",
    change_type_value="insert"
)

result = sem_json_to_pcdm_transformer.transform()

Parameters:

Name Type Description Default
data DataFrame

Dataframe containing the column with SEM data

required
source_column_name str

Spark Dataframe column containing the Json SEM data

required
version int

The version for the OBC field mappings. The latest version is 10.

required
status_null_value optional str

If populated, will replace 'Good' in the Status column with the specified value.

'Good'
change_type_value optional str

If populated, will replace 'insert' in the ChangeType column with the specified value.

'insert'
Source code in src/sdk/python/rtdip_sdk/pipelines/transformers/spark/sem_json_to_pcdm.py
class SEMJsonToPCDMTransformer(TransformerInterface):
    """
    Converts a Spark Dataframe column containing a json string created by SEM to the Process Control Data Model.

    Example
    --------
    ```python
    from rtdip_sdk.pipelines.transformers import SEMJsonToPCDMTransformer

    sem_json_to_pcdm_transformer = SEMJsonToPCDMTransformer(
        data=df,
        source_column_name="body",
        version=10,
        status_null_value="Good",
        change_type_value="insert"
    )

    result = sem_json_to_pcdm_transformer.transform()
    ```

    Parameters:
        data (DataFrame): Dataframe containing the column with SEM data
        source_column_name (str): Spark Dataframe column containing the Json SEM data
        version (int): The version for the OBC field mappings. The latest version is 10.
        status_null_value (optional str): If populated, will replace 'Good' in the Status column with the specified value.
        change_type_value (optional str): If populated, will replace 'insert' in the ChangeType column with the specified value.
    """

    data: DataFrame
    source_column_name: str
    version: int
    status_null_value: str
    change_type_value: str

    def __init__(
        self,
        data: DataFrame,
        source_column_name: str,
        version: int,
        status_null_value: str = "Good",
        change_type_value: str = "insert",
    ) -> None:
        _package_version_meets_minimum("pyspark", "3.4.0")
        self.data = data
        self.source_column_name = source_column_name
        self.version = version
        self.status_null_value = status_null_value
        self.change_type_value = change_type_value

    @staticmethod
    def system_type():
        """
        Attributes:
            SystemType (Environment): Requires PYSPARK
        """
        return SystemType.PYSPARK

    @staticmethod
    def libraries():
        libraries = Libraries()
        return libraries

    @staticmethod
    def settings() -> dict:
        return {}

    def pre_transform_validation(self):
        return True

    def post_transform_validation(self):
        return True

    def transform(self) -> DataFrame:
        """
        Returns:
            DataFrame: A dataframe with the specified column converted to PCDM
        """
        if self.version == 10:
            mapping = obc_field_mappings.OBC_FIELD_MAPPINGS_V10
            df = (
                self.data.withColumn(
                    self.source_column_name,
                    from_json(self.source_column_name, SEM_SCHEMA),
                )
                .select(self.source_column_name + ".readings")
                .melt(
                    ids=["readings.resourceName"],
                    values=["readings.value"],
                    variableColumnName="var",
                    valueColumnName="value",
                )
                .drop("var")
                .select(map_from_arrays("resourceName", "value").alias("resourceName"))
                .select("resourceName.dID", "resourceName.d", "resourceName.t")
                .select(
                    regexp_replace(col("t").cast("string"), "(\d{10})(\d+)", "$1.$2")
                    .cast("double")
                    .alias("timestamp"),
                    "dID",
                    posexplode(split(expr("substring(d, 2, length(d)-2)"), ",")),
                )
                .select(
                    to_timestamp("timestamp").alias("EventTime"),
                    col("dID"),
                    col("pos").cast("string"),
                    col("col").alias("Value"),
                )
                .withColumn(
                    "TagName",
                    concat(
                        col("dID"),
                        lit(":"),
                        udf(lambda row: mapping[row]["TagName"])(col("pos")),
                    ),
                )
                .withColumn(
                    "ValueType", udf(lambda row: mapping[row]["ValueType"])(col("pos"))
                )
                .withColumn("Status", lit(self.status_null_value))
                .withColumn("ChangeType", lit(self.change_type_value))
            )
            return df.select(
                "EventTime", "TagName", "Status", "Value", "ValueType", "ChangeType"
            )
        else:
            return logging.exception(
                "The wrong version was specified. Please use the latest version"
            )

system_type() staticmethod

Attributes:

Name Type Description
SystemType Environment

Requires PYSPARK

Source code in src/sdk/python/rtdip_sdk/pipelines/transformers/spark/sem_json_to_pcdm.py
@staticmethod
def system_type():
    """
    Attributes:
        SystemType (Environment): Requires PYSPARK
    """
    return SystemType.PYSPARK

transform()

Returns:

Name Type Description
DataFrame DataFrame

A dataframe with the specified column converted to PCDM

Source code in src/sdk/python/rtdip_sdk/pipelines/transformers/spark/sem_json_to_pcdm.py
def transform(self) -> DataFrame:
    """
    Returns:
        DataFrame: A dataframe with the specified column converted to PCDM
    """
    if self.version == 10:
        mapping = obc_field_mappings.OBC_FIELD_MAPPINGS_V10
        df = (
            self.data.withColumn(
                self.source_column_name,
                from_json(self.source_column_name, SEM_SCHEMA),
            )
            .select(self.source_column_name + ".readings")
            .melt(
                ids=["readings.resourceName"],
                values=["readings.value"],
                variableColumnName="var",
                valueColumnName="value",
            )
            .drop("var")
            .select(map_from_arrays("resourceName", "value").alias("resourceName"))
            .select("resourceName.dID", "resourceName.d", "resourceName.t")
            .select(
                regexp_replace(col("t").cast("string"), "(\d{10})(\d+)", "$1.$2")
                .cast("double")
                .alias("timestamp"),
                "dID",
                posexplode(split(expr("substring(d, 2, length(d)-2)"), ",")),
            )
            .select(
                to_timestamp("timestamp").alias("EventTime"),
                col("dID"),
                col("pos").cast("string"),
                col("col").alias("Value"),
            )
            .withColumn(
                "TagName",
                concat(
                    col("dID"),
                    lit(":"),
                    udf(lambda row: mapping[row]["TagName"])(col("pos")),
                ),
            )
            .withColumn(
                "ValueType", udf(lambda row: mapping[row]["ValueType"])(col("pos"))
            )
            .withColumn("Status", lit(self.status_null_value))
            .withColumn("ChangeType", lit(self.change_type_value))
        )
        return df.select(
            "EventTime", "TagName", "Status", "Value", "ValueType", "ChangeType"
        )
    else:
        return logging.exception(
            "The wrong version was specified. Please use the latest version"
        )
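
Note that when a version other than 10 is supplied, the transform above logs an error and returns None rather than raising an exception. A minimal sketch (an assumption about how a caller might guard against this, not SDK behaviour) is shown below:

```python
from rtdip_sdk.pipelines.transformers import SEMJsonToPCDMTransformer

SUPPORTED_SEM_VERSIONS = {10}  # hypothetical constant for illustration


def transform_sem(df, version: int):
    # Fail fast instead of receiving None back from transform()
    if version not in SUPPORTED_SEM_VERSIONS:
        raise ValueError(f"Unsupported SEM version {version}")
    return SEMJsonToPCDMTransformer(
        data=df, source_column_name="body", version=version
    ).transform()
```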

MiricoJsonToPCDMTransformer

Bases: TransformerInterface

Converts a Spark Dataframe column containing a json string created from Mirico to the Process Control Data Model.

Example

from rtdip_sdk.pipelines.transformers import MiricoJsonToPCDMTransformer

mirico_json_to_pcdm_transformer = MiricoJsonToPCDMTransformer(
    data=df,
    source_column_name="body",
    status_null_value="Good",
    change_type_value="insert",
    tagname_field="test"
)

result = mirico_json_to_pcdm_transformer.transform()

Parameters:

Name Type Description Default
data DataFrame

Dataframe containing the column with Mirico data

required
source_column_name str

Spark Dataframe column containing the Json Mirico data

required
status_null_value optional str

If populated, will replace 'Good' in the Status column with the specified value.

'Good'
change_type_value optional str

If populated, will replace 'insert' in the ChangeType column with the specified value.

'insert'
tagname_field optional str

If populated, will add the specified field to the TagName column.

None
Source code in src/sdk/python/rtdip_sdk/pipelines/transformers/spark/mirico_json_to_pcdm.py
class MiricoJsonToPCDMTransformer(TransformerInterface):
    """
    Converts a Spark Dataframe column containing a json string created from Mirico to the Process Control Data Model.

    Example
    --------
    ```python
    from rtdip_sdk.pipelines.transformers import MiricoJsonToPCDMTransformer

    mirico_json_to_pcdm_transformer = MiricoJsonToPCDMTransformer(
        data=df,
        source_column_name="body",
        status_null_value="Good",
        change_type_value="insert",
        tagname_field="test"
    )

    result = mirico_json_to_pcdm_transformer.transform()
    ```

    Parameters:
        data (DataFrame): Dataframe containing the column with Mirico data
        source_column_name (str): Spark Dataframe column containing the Json Mirico data
        status_null_value (optional str): If populated, will replace 'Good' in the Status column with the specified value.
        change_type_value (optional str): If populated, will replace 'insert' in the ChangeType column with the specified value.
        tagname_field (optional str): If populated, will add the specified field to the TagName column.
    """

    data: DataFrame
    source_column_name: str
    status_null_value: str
    change_type_value: str

    def __init__(
        self,
        data: DataFrame,
        source_column_name: str,
        status_null_value: str = "Good",
        change_type_value: str = "insert",
        tagname_field: str = None,
    ) -> None:
        _package_version_meets_minimum("pyspark", "3.4.0")
        self.data = data
        self.source_column_name = source_column_name
        self.status_null_value = status_null_value
        self.change_type_value = change_type_value
        self.tagname_field = tagname_field

    @staticmethod
    def system_type():
        """
        Attributes:
            SystemType (Environment): Requires PYSPARK
        """
        return SystemType.PYSPARK

    @staticmethod
    def libraries():
        libraries = Libraries()
        return libraries

    @staticmethod
    def settings() -> dict:
        return {}

    def pre_transform_validation(self):
        return True

    def post_transform_validation(self):
        return True

    def transform(self) -> DataFrame:
        """
        Returns:
            DataFrame: A dataframe with the specified column converted to PCDM
        """

        mapping = mirico_field_mappings.MIRICO_FIELD_MAPPINGS
        df = (
            self.data.withColumn(
                self.source_column_name,
                from_json(self.source_column_name, "map<string,string>"),
            )
            .withColumn("TagName", map_keys("body"))
            .withColumn("Value", map_values("body"))
            .select(
                map_from_arrays("TagName", "Value").alias("x"),
                to_timestamp(col("x.timeStamp")).alias("EventTime"),
                col("x.siteName").alias("siteName"),
                col("x.gasType").alias("gasType"),
                col("x.retroName").alias("retroName"),
            )
            .select("EventTime", "siteName", "gasType", "retroName", posexplode("x"))
            .withColumn(
                "ValueType", udf(lambda row: mapping[row]["ValueType"])(col("pos"))
            )
            .withColumn("Status", lit("Good"))
            .withColumn("ChangeType", lit("insert"))
            .withColumn(
                "TagName",
                when(
                    lit(self.tagname_field).isNotNull(),
                    concat_ws(
                        ":",
                        *[
                            upper(lit(self.tagname_field)),
                            concat_ws(
                                "_",
                                *[
                                    upper(col("siteName")),
                                    upper(col("retroName")),
                                    when(
                                        upper(col("key")) == "GASPPM",
                                        concat_ws(
                                            "_",
                                            *[upper(col("key")), upper(col("gasType"))]
                                        ),
                                    ).otherwise(upper(col("key"))),
                                ]
                            ),
                        ]
                    ),
                ).otherwise(
                    concat_ws(
                        "_",
                        *[
                            upper(col("siteName")),
                            upper(col("retroName")),
                            when(
                                upper(col("key")) == "GASPPM",
                                concat_ws(
                                    "_", *[upper(col("key")), upper(col("gasType"))]
                                ),
                            ).otherwise(upper(col("key"))),
                        ]
                    )
                ),
            )
            .filter(
                ~col("key").isin(
                    "timeStamp",
                    "gasType",
                    "retroLongitude",
                    "retroLatitude",
                    "retroAltitude",
                    "sensorLongitude",
                    "sensorLatitude",
                    "sensorAltitude",
                    "siteName",
                    "siteKey",
                    "retroName",
                )
            )
        )
        return df.select(
            "EventTime", "TagName", "Status", "Value", "ValueType", "ChangeType"
        )

system_type() staticmethod

Attributes:

Name Type Description
SystemType Environment

Requires PYSPARK

Source code in src/sdk/python/rtdip_sdk/pipelines/transformers/spark/mirico_json_to_pcdm.py
@staticmethod
def system_type():
    """
    Attributes:
        SystemType (Environment): Requires PYSPARK
    """
    return SystemType.PYSPARK

transform()

Returns:

Name Type Description
DataFrame DataFrame

A dataframe with the specified column converted to PCDM

Source code in src/sdk/python/rtdip_sdk/pipelines/transformers/spark/mirico_json_to_pcdm.py
def transform(self) -> DataFrame:
    """
    Returns:
        DataFrame: A dataframe with the specified column converted to PCDM
    """

    mapping = mirico_field_mappings.MIRICO_FIELD_MAPPINGS
    df = (
        self.data.withColumn(
            self.source_column_name,
            from_json(self.source_column_name, "map<string,string>"),
        )
        .withColumn("TagName", map_keys("body"))
        .withColumn("Value", map_values("body"))
        .select(
            map_from_arrays("TagName", "Value").alias("x"),
            to_timestamp(col("x.timeStamp")).alias("EventTime"),
            col("x.siteName").alias("siteName"),
            col("x.gasType").alias("gasType"),
            col("x.retroName").alias("retroName"),
        )
        .select("EventTime", "siteName", "gasType", "retroName", posexplode("x"))
        .withColumn(
            "ValueType", udf(lambda row: mapping[row]["ValueType"])(col("pos"))
        )
        .withColumn("Status", lit("Good"))
        .withColumn("ChangeType", lit("insert"))
        .withColumn(
            "TagName",
            when(
                lit(self.tagname_field).isNotNull(),
                concat_ws(
                    ":",
                    *[
                        upper(lit(self.tagname_field)),
                        concat_ws(
                            "_",
                            *[
                                upper(col("siteName")),
                                upper(col("retroName")),
                                when(
                                    upper(col("key")) == "GASPPM",
                                    concat_ws(
                                        "_",
                                        *[upper(col("key")), upper(col("gasType"))]
                                    ),
                                ).otherwise(upper(col("key"))),
                            ]
                        ),
                    ]
                ),
            ).otherwise(
                concat_ws(
                    "_",
                    *[
                        upper(col("siteName")),
                        upper(col("retroName")),
                        when(
                            upper(col("key")) == "GASPPM",
                            concat_ws(
                                "_", *[upper(col("key")), upper(col("gasType"))]
                            ),
                        ).otherwise(upper(col("key"))),
                    ]
                )
            ),
        )
        .filter(
            ~col("key").isin(
                "timeStamp",
                "gasType",
                "retroLongitude",
                "retroLatitude",
                "retroAltitude",
                "sensorLongitude",
                "sensorLatitude",
                "sensorAltitude",
                "siteName",
                "siteKey",
                "retroName",
            )
        )
    )
    return df.select(
        "EventTime", "TagName", "Status", "Value", "ValueType", "ChangeType"
    )
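
The nested when/concat_ws expressions above implement a tag-naming convention. A pure-Python restatement of that convention (for illustration only; the Spark expressions above remain the source of truth) may make it easier to follow:

```python
from typing import Optional


def mirico_tag_name(
    site_name: str,
    retro_name: str,
    key: str,
    gas_type: str,
    tagname_field: Optional[str] = None,
) -> str:
    # GASPPM readings are suffixed with the gas type, e.g. GASPPM_CH4
    suffix = key.upper()
    if suffix == "GASPPM":
        suffix = f"{suffix}_{gas_type.upper()}"
    tag = "_".join([site_name.upper(), retro_name.upper(), suffix])
    # When tagname_field is supplied it is prepended with a ":" separator
    if tagname_field is not None:
        tag = f"{tagname_field.upper()}:{tag}"
    return tag


# mirico_tag_name("siteA", "retro1", "gasPpm", "CH4", "test") -> "TEST:SITEA_RETRO1_GASPPM_CH4"
```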

MiricoJsonToMetadataTransformer

Bases: TransformerInterface

Converts a Spark Dataframe column containing a json string created from Mirico to the Metadata Model.

Example

from rtdip_sdk.pipelines.transformers import MiricoJsonToMetadataTransformer

mirico_json_to_metadata_transformer = MiricoJsonToMetadataTransformer(
    data=df,
    source_column_name="body"
)

result = mirico_json_to_metadata_transformer.transform()

Parameters:

Name Type Description Default
data DataFrame

Dataframe containing the column with Mirico data

required
source_column_name str

Spark Dataframe column containing the Json Mirico data

required
Source code in src/sdk/python/rtdip_sdk/pipelines/transformers/spark/mirico_json_to_metadata.py
class MiricoJsonToMetadataTransformer(TransformerInterface):
    """
    Converts a Spark Dataframe column containing a json string created from Mirico to the Metadata Model.

    Example
    --------
    ```python
    from rtdip_sdk.pipelines.transformers import MiricoJsonToMetadataTransformer

    mirico_json_to_metadata_transformer = MiricoJsonToMetadataTransformer(
        data=df,
        source_column_name="body"
    )

    result = mirico_json_to_metadata_transformer.transform()
    ```

    Parameters:
        data (DataFrame): Dataframe containing the column with Mirico data
        source_column_name (str): Spark Dataframe column containing the Json Mirico data
    """

    data: DataFrame
    source_column_name: str

    def __init__(self, data: DataFrame, source_column_name: str) -> None:
        _package_version_meets_minimum("pyspark", "3.4.0")
        self.data = data
        self.source_column_name = source_column_name

    @staticmethod
    def system_type():
        """
        Attributes:
            SystemType (Environment): Requires PYSPARK
        """
        return SystemType.PYSPARK

    @staticmethod
    def libraries():
        libraries = Libraries()
        return libraries

    @staticmethod
    def settings() -> dict:
        return {}

    def pre_transform_validation(self):
        return True

    def post_transform_validation(self):
        return True

    def transform(self) -> DataFrame:
        """
        Returns:
            DataFrame: A dataframe with the specified column converted to Metadata model
        """

        df = self.data.select(
            from_json(self.source_column_name, MIRICO_METADATA_SCHEMA).alias("body"),
        )

        tag_name_expr = concat_ws(
            "_",
            *[
                upper(col("body.siteName")),
                upper(col("body.retroName")),
                upper(col("body.gasType")),
            ]
        )

        df = df.select(
            tag_name_expr.alias("TagName"),
            lit("").alias("Description"),
            lit("").alias("UoM"),
            expr(
                """struct(
                body.retroAltitude,
                body.retroLongitude,
                body.retroLatitude,
                body.sensorAltitude,
                body.sensorLongitude,
                body.sensorLatitude)"""
            ).alias("Properties"),
        ).dropDuplicates(["TagName"])

        return df.select("TagName", "Description", "UoM", "Properties")

system_type() staticmethod

Attributes:

Name Type Description
SystemType Environment

Requires PYSPARK

Source code in src/sdk/python/rtdip_sdk/pipelines/transformers/spark/mirico_json_to_metadata.py
@staticmethod
def system_type():
    """
    Attributes:
        SystemType (Environment): Requires PYSPARK
    """
    return SystemType.PYSPARK

transform()

Returns:

Name Type Description
DataFrame DataFrame

A dataframe with the specified column converted to Metadata model

Source code in src/sdk/python/rtdip_sdk/pipelines/transformers/spark/mirico_json_to_metadata.py
def transform(self) -> DataFrame:
    """
    Returns:
        DataFrame: A dataframe with the specified column converted to Metadata model
    """

    df = self.data.select(
        from_json(self.source_column_name, MIRICO_METADATA_SCHEMA).alias("body"),
    )

    tag_name_expr = concat_ws(
        "_",
        *[
            upper(col("body.siteName")),
            upper(col("body.retroName")),
            upper(col("body.gasType")),
        ]
    )

    df = df.select(
        tag_name_expr.alias("TagName"),
        lit("").alias("Description"),
        lit("").alias("UoM"),
        expr(
            """struct(
            body.retroAltitude,
            body.retroLongitude,
            body.retroLatitude,
            body.sensorAltitude,
            body.sensorLongitude,
            body.sensorLatitude)"""
        ).alias("Properties"),
    ).dropDuplicates(["TagName"])

    return df.select("TagName", "Description", "UoM", "Properties")

PandasToPySparkTransformer

Bases: TransformerInterface

Converts a Pandas DataFrame to a PySpark DataFrame.

Example

from rtdip_sdk.pipelines.transformers import PandasToPySparkTransformer
from rtdip_sdk.pipelines.utilities import SparkSessionUtility

# Not required if using Databricks
spark = SparkSessionUtility(config={}).execute()

pandas_to_pyspark = PandasToPySparkTransformer(
    spark=spark,
    df=df,
)

result = pandas_to_pyspark.transform()

Parameters:

Name Type Description Default
spark SparkSession

Spark Session required to convert DataFrame

required
df DataFrame

Pandas DataFrame to be converted

required
Source code in src/sdk/python/rtdip_sdk/pipelines/transformers/spark/pandas_to_pyspark.py
class PandasToPySparkTransformer(TransformerInterface):
    """
    Converts a Pandas DataFrame to a PySpark DataFrame.

    Example
    --------
    ```python
    from rtdip_sdk.pipelines.transformers import PandasToPySparkTransformer
    from rtdip_sdk.pipelines.utilities import SparkSessionUtility

    # Not required if using Databricks
    spark = SparkSessionUtility(config={}).execute()

    pandas_to_pyspark = PandasToPySparkTransformer(
        spark=spark,
        df=df,
    )

    result = pandas_to_pyspark.transform()
    ```

    Parameters:
        spark (SparkSession): Spark Session required to convert DataFrame
        df (DataFrame): Pandas DataFrame to be converted
    """

    spark: SparkSession
    df: PandasDataFrame

    def __init__(self, spark: SparkSession, df: PandasDataFrame) -> None:
        self.spark = spark
        self.df = df

    @staticmethod
    def system_type():
        """
        Attributes:
            SystemType (Environment): Requires PYSPARK
        """
        return SystemType.PYSPARK

    @staticmethod
    def libraries():
        libraries = Libraries()
        return libraries

    @staticmethod
    def settings() -> dict:
        return {}

    def pre_transform_validation(self):
        return True

    def post_transform_validation(self):
        return True

    def transform(self) -> PySparkDataFrame:
        """
        Returns:
            DataFrame: A PySpark dataframe converted from a Pandas DataFrame.
        """

        self.df = _prepare_pandas_to_convert_to_spark(self.df)
        df = self.spark.createDataFrame(self.df)

        return df

system_type() staticmethod

Attributes:

Name Type Description
SystemType Environment

Requires PYSPARK

Source code in src/sdk/python/rtdip_sdk/pipelines/transformers/spark/pandas_to_pyspark.py
@staticmethod
def system_type():
    """
    Attributes:
        SystemType (Environment): Requires PYSPARK
    """
    return SystemType.PYSPARK

transform()

Returns:

Name Type Description
DataFrame DataFrame

A PySpark dataframe converted from a Pandas DataFrame.

Source code in src/sdk/python/rtdip_sdk/pipelines/transformers/spark/pandas_to_pyspark.py
def transform(self) -> PySparkDataFrame:
    """
    Returns:
        DataFrame: A PySpark dataframe converted from a Pandas DataFrame.
    """

    self.df = _prepare_pandas_to_convert_to_spark(self.df)
    df = self.spark.createDataFrame(self.df)

    return df

PySparkToPandasTransformer

Bases: TransformerInterface

Converts a PySpark DataFrame to a Pandas DataFrame.

Example

from rtdip_sdk.pipelines.transformers import PySparkToPandasTransformer

pyspark_to_pandas = PySparkToPandasTransformer(
    df=df
)

result = pyspark_to_pandas.transform()

Parameters:

Name Type Description Default
df DataFrame

PySpark DataFrame to be converted

required
Source code in src/sdk/python/rtdip_sdk/pipelines/transformers/spark/pyspark_to_pandas.py
class PySparkToPandasTransformer(TransformerInterface):
    """
    Converts a PySpark DataFrame to a Pandas DataFrame.

    Example
    --------
    ```python
    from rtdip_sdk.pipelines.transformers import PySparkToPandasTransformer

    pyspark_to_pandas = PySparkToPandasTransformer(
        df=df
    )

    result = pyspark_to_pandas.transform()
    ```

    Parameters:
        df (DataFrame): PySpark DataFrame to be converted
    """

    df: PySparkDataFrame

    def __init__(self, df: PySparkDataFrame) -> None:
        self.df = df

    @staticmethod
    def system_type():
        """
        Attributes:
            SystemType (Environment): Requires PYSPARK
        """
        return SystemType.PYSPARK

    @staticmethod
    def libraries():
        libraries = Libraries()
        return libraries

    @staticmethod
    def settings() -> dict:
        return {}

    def pre_transform_validation(self):
        return True

    def post_transform_validation(self):
        return True

    def transform(self) -> PandasDataFrame:
        """
        Returns:
            DataFrame: A Pandas dataframe converted from a PySpark DataFrame.
        """
        df = self.df.toPandas()
        return df

system_type() staticmethod

Attributes:

Name Type Description
SystemType Environment

Requires PYSPARK

Source code in src/sdk/python/rtdip_sdk/pipelines/transformers/spark/pyspark_to_pandas.py
@staticmethod
def system_type():
    """
    Attributes:
        SystemType (Environment): Requires PYSPARK
    """
    return SystemType.PYSPARK

transform()

Returns:

Name Type Description
DataFrame DataFrame

A Pandas dataframe converted from a PySpark DataFrame.

Source code in src/sdk/python/rtdip_sdk/pipelines/transformers/spark/pyspark_to_pandas.py
def transform(self) -> PandasDataFrame:
    """
    Returns:
        DataFrame: A Pandas dataframe converted from a PySpark DataFrame.
    """
    df = self.df.toPandas()
    return df
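
A short round-trip sketch combining PandasToPySparkTransformer and PySparkToPandasTransformer; the sample column names are illustrative only:

```python
import pandas as pd

from rtdip_sdk.pipelines.transformers import (
    PandasToPySparkTransformer,
    PySparkToPandasTransformer,
)
from rtdip_sdk.pipelines.utilities import SparkSessionUtility

# Not required if using Databricks
spark = SparkSessionUtility(config={}).execute()

pandas_df = pd.DataFrame({"TagName": ["TAG-001", "TAG-002"], "Value": [1.5, 2.5]})

# Pandas -> PySpark
spark_df = PandasToPySparkTransformer(spark=spark, df=pandas_df).transform()

# PySpark -> Pandas
round_trip_df = PySparkToPandasTransformer(df=spark_df).transform()
```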

SparkDeltaDestination

Bases: DestinationInterface

The Spark Delta Destination is used to write data to a Delta table.

Examples

#Delta Destination for Streaming Queries

from rtdip_sdk.pipelines.destinations import SparkDeltaDestination

delta_destination = SparkDeltaDestination(
    data=df,
    options={
        "checkpointLocation": "/{CHECKPOINT-LOCATION}/"
    },
    destination="DELTA-TABLE-PATH",
    mode="append",
    trigger="10 seconds",
    query_name="DeltaDestination",
    query_wait_interval=None
)

delta_destination.write_stream()
#Delta Destination for Batch Queries

from rtdip_sdk.pipelines.destinations import SparkDeltaDestination

delta_destination = SparkDeltaDestination(
    data=df,
    options={
        "overwriteSchema": True
    },
    destination="DELTA-TABLE-PATH",
    mode="append",
    trigger="10 seconds",
    query_name="DeltaDestination",
    query_wait_interval=None
)

delta_destination.write_batch()

Parameters:

Name Type Description Default
data DataFrame

Dataframe to be written to Delta

required
options dict

Options that can be specified for a Delta Table write operation (See Attributes table below). Further information on the options is available for batch and streaming.

required
destination str

Either the name of the Hive Metastore or Unity Catalog Delta Table or the path to the Delta table

required
mode optional str

Method of writing to Delta Table - append/overwrite (batch), append/update/complete (stream). Default is append

'append'
trigger optional str

Frequency of the write operation. Specify "availableNow" to execute a trigger once, otherwise specify a time period such as "30 seconds", "5 minutes". Set to "0 seconds" if you do not want to use a trigger. (stream) Default is 10 seconds

'10 seconds'
query_name optional str

Unique name for the query in associated SparkSession. (stream) Default is DeltaDestination

'DeltaDestination'
query_wait_interval optional int

If set, waits for the streaming query to complete before returning. (stream) Default is None

None

Attributes:

Name Type Description
checkpointLocation str

Path to checkpoint files. (Streaming)

txnAppId str

A unique string that you can pass on each DataFrame write. (Batch & Streaming)

txnVersion str

A monotonically increasing number that acts as transaction version. (Batch & Streaming)

maxRecordsPerFile int str

Specify the maximum number of records to write to a single file for a Delta Lake table. (Batch)

replaceWhere str

Condition(s) for overwriting. (Batch)

partitionOverwriteMode str

When set to dynamic, overwrites all existing data in each logical partition for which the write will commit new data. Default is static. (Batch)

overwriteSchema bool str

If True, overwrites the schema as well as the table data. (Batch)

Source code in src/sdk/python/rtdip_sdk/pipelines/destinations/spark/delta.py
class SparkDeltaDestination(DestinationInterface):
    """
    The Spark Delta Destination is used to write data to a Delta table.

    Examples
    --------
    ```python
    #Delta Destination for Streaming Queries

    from rtdip_sdk.pipelines.destinations import SparkDeltaDestination

    delta_destination = SparkDeltaDestination(
        data=df,
        options={
            "checkpointLocation": "/{CHECKPOINT-LOCATION}/"
        },
        destination="DELTA-TABLE-PATH",
        mode="append",
        trigger="10 seconds",
        query_name="DeltaDestination",
        query_wait_interval=None
    )

    delta_destination.write_stream()
    ```
    ```python
    #Delta Destination for Batch Queries

    from rtdip_sdk.pipelines.destinations import SparkDeltaDestination

    delta_destination = SparkDeltaDestination(
        data=df,
        options={
            "overwriteSchema": True
        },
        destination="DELTA-TABLE-PATH",
        mode="append",
        trigger="10 seconds",
        query_name="DeltaDestination",
        query_wait_interval=None
    )

    delta_destination.write_batch()
    ```

    Parameters:
        data (DataFrame): Dataframe to be written to Delta
        options (dict): Options that can be specified for a Delta Table write operation (See Attributes table below). Further information on the options is available for [batch](https://docs.delta.io/latest/delta-batch.html#write-to-a-table){ target="_blank" } and [streaming](https://docs.delta.io/latest/delta-streaming.html#delta-table-as-a-sink){ target="_blank" }.
        destination (str): Either the name of the Hive Metastore or Unity Catalog Delta Table **or** the path to the Delta table
        mode (optional str): Method of writing to Delta Table - append/overwrite (batch), append/update/complete (stream). Default is append
        trigger (optional str): Frequency of the write operation. Specify "availableNow" to execute a trigger once, otherwise specify a time period such as "30 seconds", "5 minutes". Set to "0 seconds" if you do not want to use a trigger. (stream) Default is 10 seconds
        query_name (optional str): Unique name for the query in associated SparkSession. (stream) Default is DeltaDestination
        query_wait_interval (optional int): If set, waits for the streaming query to complete before returning. (stream) Default is None

    Attributes:
        checkpointLocation (str): Path to checkpoint files. (Streaming)
        txnAppId (str): A unique string that you can pass on each DataFrame write. (Batch & Streaming)
        txnVersion (str): A monotonically increasing number that acts as transaction version. (Batch & Streaming)
        maxRecordsPerFile (int str): Specify the maximum number of records to write to a single file for a Delta Lake table. (Batch)
        replaceWhere (str): Condition(s) for overwriting. (Batch)
        partitionOverwriteMode (str): When set to dynamic, overwrites all existing data in each logical partition for which the write will commit new data. Default is static. (Batch)
        overwriteSchema (bool str): If True, overwrites the schema as well as the table data. (Batch)
    """

    data: DataFrame
    options: dict
    destination: str
    mode: str
    trigger: str
    query_name: str
    query_wait_interval: int

    def __init__(
        self,
        data: DataFrame,
        options: dict,
        destination: str,
        mode: str = "append",
        trigger: str = "10 seconds",
        query_name: str = "DeltaDestination",
        query_wait_interval: int = None,
    ) -> None:
        self.data = data
        self.options = options
        self.destination = destination
        self.mode = mode
        self.trigger = trigger
        self.query_name = query_name
        self.query_wait_interval = query_wait_interval

    @staticmethod
    def system_type():
        """
        Attributes:
            SystemType (Environment): Requires PYSPARK
        """
        return SystemType.PYSPARK

    @staticmethod
    def libraries():
        libraries = Libraries()
        libraries.add_maven_library(get_default_package("spark_delta_core"))
        return libraries

    @staticmethod
    def settings() -> dict:
        return {
            "spark.sql.extensions": "io.delta.sql.DeltaSparkSessionExtension",
            "spark.sql.catalog.spark_catalog": "org.apache.spark.sql.delta.catalog.DeltaCatalog",
        }

    def pre_write_validation(self):
        return True

    def post_write_validation(self):
        return True

    def write_batch(self):
        """
        Writes batch data to Delta. Most of the options provided by the Apache Spark DataFrame write API are supported for performing batch writes on tables.
        """
        try:
            if "/" in self.destination:
                return (
                    self.data.write.format("delta")
                    .mode(self.mode)
                    .options(**self.options)
                    .save(self.destination)
                )
            else:
                return (
                    self.data.write.format("delta")
                    .mode(self.mode)
                    .options(**self.options)
                    .saveAsTable(self.destination)
                )

        except Py4JJavaError as e:
            logging.exception(e.errmsg)
            raise e
        except Exception as e:
            logging.exception(str(e))
            raise e

    def write_stream(self):
        """
        Writes streaming data to Delta. Exactly-once processing is guaranteed
        """
        TRIGGER_OPTION = (
            {"availableNow": True}
            if self.trigger == "availableNow"
            else {"processingTime": self.trigger}
        )
        try:
            if "/" in self.destination:
                query = (
                    self.data.writeStream.trigger(**TRIGGER_OPTION)
                    .format("delta")
                    .queryName(self.query_name)
                    .outputMode(self.mode)
                    .options(**self.options)
                    .start(self.destination)
                )
            else:
                query = (
                    self.data.writeStream.trigger(**TRIGGER_OPTION)
                    .format("delta")
                    .queryName(self.query_name)
                    .outputMode(self.mode)
                    .options(**self.options)
                    .toTable(self.destination)
                )

            if self.query_wait_interval:
                while query.isActive:
                    if query.lastProgress:
                        logging.info(query.lastProgress)
                    time.sleep(self.query_wait_interval)

        except Py4JJavaError as e:
            logging.exception(e.errmsg)
            raise e
        except Exception as e:
            logging.exception(str(e))
            raise e

system_type() staticmethod

Attributes:

Name Type Description
SystemType Environment

Requires PYSPARK

Source code in src/sdk/python/rtdip_sdk/pipelines/destinations/spark/delta.py
@staticmethod
def system_type():
    """
    Attributes:
        SystemType (Environment): Requires PYSPARK
    """
    return SystemType.PYSPARK

write_batch()

Writes batch data to Delta. Most of the options provided by the Apache Spark DataFrame write API are supported for performing batch writes on tables.

Source code in src/sdk/python/rtdip_sdk/pipelines/destinations/spark/delta.py
def write_batch(self):
    """
    Writes batch data to Delta. Most of the options provided by the Apache Spark DataFrame write API are supported for performing batch writes on tables.
    """
    try:
        if "/" in self.destination:
            return (
                self.data.write.format("delta")
                .mode(self.mode)
                .options(**self.options)
                .save(self.destination)
            )
        else:
            return (
                self.data.write.format("delta")
                .mode(self.mode)
                .options(**self.options)
                .saveAsTable(self.destination)
            )

    except Py4JJavaError as e:
        logging.exception(e.errmsg)
        raise e
    except Exception as e:
        logging.exception(str(e))
        raise e

write_stream()

Writes streaming data to Delta. Exactly-once processing is guaranteed

Source code in src/sdk/python/rtdip_sdk/pipelines/destinations/spark/delta.py
def write_stream(self):
    """
    Writes streaming data to Delta. Exactly-once processing is guaranteed
    """
    TRIGGER_OPTION = (
        {"availableNow": True}
        if self.trigger == "availableNow"
        else {"processingTime": self.trigger}
    )
    try:
        if "/" in self.destination:
            query = (
                self.data.writeStream.trigger(**TRIGGER_OPTION)
                .format("delta")
                .queryName(self.query_name)
                .outputMode(self.mode)
                .options(**self.options)
                .start(self.destination)
            )
        else:
            query = (
                self.data.writeStream.trigger(**TRIGGER_OPTION)
                .format("delta")
                .queryName(self.query_name)
                .outputMode(self.mode)
                .options(**self.options)
                .toTable(self.destination)
            )

        if self.query_wait_interval:
            while query.isActive:
                if query.lastProgress:
                    logging.info(query.lastProgress)
                time.sleep(self.query_wait_interval)

    except Py4JJavaError as e:
        logging.exception(e.errmsg)
        raise e
    except Exception as e:
        logging.exception(str(e))
        raise e
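
The trigger logic above maps "availableNow" to a one-shot trigger and any other value to a processing-time interval. A minimal sketch (assuming df is a streaming DataFrame and the placeholder checkpoint path and table name are replaced) of a one-shot streaming write:

```python
from rtdip_sdk.pipelines.destinations import SparkDeltaDestination

SparkDeltaDestination(
    data=df,
    options={"checkpointLocation": "/{CHECKPOINT-LOCATION}/"},
    destination="DELTA-TABLE-PATH",
    mode="append",
    trigger="availableNow",        # process all available data once, then stop
    query_name="DeltaDestinationAvailableNow",
    query_wait_interval=10,        # poll the query every 10 seconds while it is active
).write_stream()
```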

SparkDeltaMergeDestination

Bases: DestinationInterface

The Spark Delta Merge Destination is used to merge data into a Delta table. Refer to this documentation for more information about Delta Merge.

Examples

#Delta Merge Destination for Streaming Queries

from rtdip_sdk.pipelines.destinations import SparkDeltaMergeDestination

delta_merge_destination = SparkDeltaMergeDestination(
    data=df,
    destination="DELTA-TABLE-PATH",
    options={
        "checkpointLocation": "/{CHECKPOINT-LOCATION}/"
    },
    merge_condition="source.id = target.id",
    when_matched_update_list=None,
    when_matched_delete_list=None,
    when_not_matched_insert_list=None,
    when_not_matched_by_source_update_list=None,
    when_not_matched_by_source_delete_list=None,
    try_broadcast_join=False,
    trigger="10 seconds",
    query_name="DeltaDestination",
    query_wait_interval=None
)

delta_merge_destination.write_stream()
#Delta Merge Destination for Batch Queries

from rtdip_sdk.pipelines.destinations import SparkDeltaMergeDestination

delta_merge_destination = SparkDeltaMergeDestination(
    data=df,
    destination="DELTA-TABLE-PATH",
    options={},
    merge_condition="source.id = target.id",
    when_matched_update_list=None,
    when_matched_delete_list=None,
    when_not_matched_insert_list=None,
    when_not_matched_by_source_update_list=None,
    when_not_matched_by_source_delete_list=None,
    try_broadcast_join=False,
    trigger="10 seconds",
    query_name="DeltaDestination",
    query_wait_interval=None
)

delta_merge_destination.write_batch()

Parameters:

Name Type Description Default
data DataFrame

Dataframe to be merged into a Delta Table

required
destination str

Either the name of the Hive Metastore or Unity Catalog Delta Table or the path to the Delta table

required
options dict

Options that can be specified for a Delta Table read operation (See Attributes table below). Further information on the options is available for batch and streaming.

required
merge_condition str

Condition for matching records between dataframe and delta table. Reference Dataframe columns as source and Delta Table columns as target. For example source.id = target.id.

required
when_matched_update_list optional list[DeltaMergeConditionValues]

Conditions(optional) and values to be used when updating rows that match the merge_condition. Specify * for Values if all columns from Dataframe should be inserted.

None
when_matched_delete_list optional list[DeltaMergeCondition]

Conditions(optional) to be used when deleting rows that match the merge_condition.

None
when_not_matched_insert_list optional list[DeltaMergeConditionValues]

Conditions(optional) and values to be used when inserting rows that do not match the merge_condition. Specify * for Values if all columns from Dataframe should be inserted.

None
when_not_matched_by_source_update_list optional list[DeltaMergeConditionValues]

Conditions(optional) and values to be used when updating rows that do not match the merge_condition.

None
when_not_matched_by_source_delete_list optional list[DeltaMergeCondition]

Conditions(optional) to be used when deleting rows that do not match the merge_condition.

None
try_broadcast_join optional bool

Attempts to perform a broadcast join in the merge which can leverage data skipping using partition pruning and file pruning automatically. Can fail if dataframe being merged is large and therefore more suitable for streaming merges than batch merges

False
trigger optional str

Frequency of the write operation. Specify "availableNow" to execute a trigger once, otherwise specify a time period such as "30 seconds", "5 minutes". Set to "0 seconds" if you do not want to use a trigger. (stream) Default is 10 seconds

'10 seconds'
query_name optional str

Unique name for the query in associated SparkSession

'DeltaMergeDestination'
query_wait_interval optional int

If set, waits for the streaming query to complete before returning. (stream) Default is None

None

Attributes:

Name Type Description
checkpointLocation str

Path to checkpoint files. (Streaming)

Source code in src/sdk/python/rtdip_sdk/pipelines/destinations/spark/delta_merge.py
class SparkDeltaMergeDestination(DestinationInterface):
    """
    The Spark Delta Merge Destination is used to merge data into a Delta table. Refer to this [documentation](https://docs.delta.io/latest/delta-update.html#upsert-into-a-table-using-merge&language-python) for more information about Delta Merge.

    Examples
    --------
    ```python
    #Delta Merge Destination for Streaming Queries

    from rtdip_sdk.pipelines.destinations import SparkDeltaMergeDestination

    delta_merge_destination = SparkDeltaMergeDestination(
        data=df,
        destination="DELTA-TABLE-PATH",
        options={
            "checkpointLocation": "/{CHECKPOINT-LOCATION}/"
        },
        merge_condition="`source.id = target.id`"
        when_matched_update_list=None
        when_matched_delete_list=None
        when_not_matched_insert_list=None
        when_not_matched_by_source_update_list=None
        when_not_matched_by_source_delete_list=None
        try_broadcast_join=False
        trigger="10 seconds",
        query_name="DeltaDestination"
        query_wait_interval=None
    )

    delta_merge_destination.write_stream()
    ```
    ```python
    #Delta Merge Destination for Batch Queries

    from rtdip_sdk.pipelines.destinations import SparkDeltaMergeDestination

    delta_merge_destination = SparkDeltaMergeDestination(
        data=df,
        destination="DELTA-TABLE-PATH",
        options={},
        merge_condition="`source.id = target.id`",
        when_matched_update_list=None,
        when_matched_delete_list=None,
        when_not_matched_insert_list=None,
        when_not_matched_by_source_update_list=None,
        when_not_matched_by_source_delete_list=None,
        try_broadcast_join=False,
        trigger="10 seconds",
        query_name="DeltaDestination"
        query_wait_interval=None
    )

    delta_merge_destination.write_batch()
    ```

    Parameters:
        data (DataFrame): Dataframe to be merged into a Delta Table
        destination (str): Either the name of the Hive Metastore or Unity Catalog Delta Table **or** the path to the Delta table
        options (dict): Options that can be specified for a Delta Table write operation (See Attributes table below). Further information on the options is available for [batch](https://docs.delta.io/latest/delta-batch.html#write-to-a-table){ target="_blank" } and [streaming](https://docs.delta.io/latest/delta-streaming.html#delta-table-as-a-sink){ target="_blank" }.
        merge_condition (str): Condition for matching records between dataframe and delta table. Reference Dataframe columns as `source` and Delta Table columns as `target`. For example `source.id = target.id`.
        when_matched_update_list (optional list[DeltaMergeConditionValues]): Conditions(optional) and values to be used when updating rows that match the `merge_condition`. Specify `*` for Values if all columns from Dataframe should be inserted.
        when_matched_delete_list (optional list[DeltaMergeCondition]): Conditions(optional) to be used when deleting rows that match the `merge_condition`.
        when_not_matched_insert_list (optional list[DeltaMergeConditionValues]): Conditions(optional) and values to be used when inserting rows that do not match the `merge_condition`. Specify `*` for Values if all columns from Dataframe should be inserted.
        when_not_matched_by_source_update_list (optional list[DeltaMergeConditionValues]): Conditions(optional) and values to be used when updating rows that do not match the `merge_condition`.
        when_not_matched_by_source_delete_list (optional list[DeltaMergeCondition]): Conditions(optional) to be used when deleting rows that do not match the `merge_condition`.
        try_broadcast_join (optional bool): Attempts to perform a broadcast join in the merge which can leverage data skipping using partition pruning and file pruning automatically. Can fail if dataframe being merged is large and therefore more suitable for streaming merges than batch merges
        trigger (optional str): Frequency of the write operation. Specify "availableNow" to execute a trigger once, otherwise specify a time period such as "30 seconds", "5 minutes". Set to "0 seconds" if you do not want to use a trigger. (stream) Default is 10 seconds
        query_name (optional str): Unique name for the query in associated SparkSession
        query_wait_interval (optional int): If set, waits for the streaming query to complete before returning. (stream) Default is None

    Attributes:
        checkpointLocation (str): Path to checkpoint files. (Streaming)
    """

    spark: SparkSession
    data: DataFrame
    destination: str
    options: dict
    merge_condition: str
    when_matched_update_list: List[DeltaMergeConditionValues]
    when_matched_delete_list: List[DeltaMergeCondition]
    when_not_matched_insert_list: List[DeltaMergeConditionValues]
    when_not_matched_by_source_update_list: List[DeltaMergeConditionValues]
    when_not_matched_by_source_delete_list: List[DeltaMergeCondition]
    try_broadcast_join: bool
    trigger: str
    query_name: str
    query_wait_interval: int

    def __init__(
        self,
        spark: SparkSession,
        data: DataFrame,
        destination: str,
        options: dict,
        merge_condition: str,
        when_matched_update_list: List[DeltaMergeConditionValues] = None,
        when_matched_delete_list: List[DeltaMergeCondition] = None,
        when_not_matched_insert_list: List[DeltaMergeConditionValues] = None,
        when_not_matched_by_source_update_list: List[DeltaMergeConditionValues] = None,
        when_not_matched_by_source_delete_list: List[DeltaMergeCondition] = None,
        try_broadcast_join: bool = False,
        trigger="10 seconds",
        query_name: str = "DeltaMergeDestination",
        query_wait_interval: int = None,
    ) -> None:
        self.spark = spark
        self.data = data
        self.destination = destination
        self.options = options
        self.merge_condition = merge_condition
        self.when_matched_update_list = (
            [] if when_matched_update_list is None else when_matched_update_list
        )
        self.when_matched_delete_list = (
            [] if when_matched_delete_list is None else when_matched_delete_list
        )
        self.when_not_matched_insert_list = (
            [] if when_not_matched_insert_list is None else when_not_matched_insert_list
        )
        if (
            isinstance(when_not_matched_by_source_update_list, list)
            and len(when_not_matched_by_source_update_list) > 0
        ):
            _package_version_meets_minimum("delta-spark", "2.3.0")
        self.when_not_matched_by_source_update_list = (
            []
            if when_not_matched_by_source_update_list is None
            else when_not_matched_by_source_update_list
        )
        if (
            isinstance(when_not_matched_by_source_delete_list, list)
            and len(when_not_matched_by_source_delete_list) > 0
        ):
            _package_version_meets_minimum("delta-spark", "2.3.0")
        self.when_not_matched_by_source_delete_list = (
            []
            if when_not_matched_by_source_delete_list is None
            else when_not_matched_by_source_delete_list
        )
        self.try_broadcast_join = try_broadcast_join
        self.trigger = trigger
        self.query_name = query_name
        self.query_wait_interval = query_wait_interval

    @staticmethod
    def system_type():
        """
        Attributes:
            SystemType (Environment): Requires PYSPARK
        """
        return SystemType.PYSPARK

    @staticmethod
    def libraries():
        libraries = Libraries()
        libraries.add_maven_library(get_default_package("spark_delta_core"))
        return libraries

    @staticmethod
    def settings() -> dict:
        return {
            "spark.sql.extensions": "io.delta.sql.DeltaSparkSessionExtension",
            "spark.sql.catalog.spark_catalog": "org.apache.spark.sql.delta.catalog.DeltaCatalog",
            "spark.databricks.delta.schema.autoMerge.enabled": "true",
        }

    def pre_write_validation(self):
        return True

    def post_write_validation(self):
        return True

    def _delta_merge_builder(
        self, df: DataFrame, try_broadcast_join: bool
    ) -> DeltaMergeBuilder:
        if "/" in self.destination:
            delta_table = DeltaTable.forPath(self.spark, self.destination)
        else:
            delta_table = DeltaTable.forName(self.spark, self.destination)

        if try_broadcast_join == True:
            delta_merge_builder = delta_table.alias("target").merge(
                source=broadcast(df).alias("source"), condition=self.merge_condition
            )
        else:
            delta_merge_builder = delta_table.alias("target").merge(
                source=df.alias("source"), condition=self.merge_condition
            )

        for when_matched_update in self.when_matched_update_list:
            if when_matched_update.values == "*":
                delta_merge_builder = delta_merge_builder.whenMatchedUpdateAll(
                    condition=when_matched_update.condition,
                )
            else:
                delta_merge_builder = delta_merge_builder.whenMatchedUpdate(
                    condition=when_matched_update.condition,
                    set=when_matched_update.values,
                )

        for when_matched_delete in self.when_matched_delete_list:
            delta_merge_builder = delta_merge_builder.whenMatchedDelete(
                condition=when_matched_delete.condition,
            )

        for when_not_matched_insert in self.when_not_matched_insert_list:
            if when_not_matched_insert.values == "*":
                delta_merge_builder = delta_merge_builder.whenNotMatchedInsertAll(
                    condition=when_not_matched_insert.condition,
                )
            else:
                delta_merge_builder = delta_merge_builder.whenNotMatchedInsert(
                    condition=when_not_matched_insert.condition,
                    values=when_not_matched_insert.values,
                )

        for (
            when_not_matched_by_source_update
        ) in self.when_not_matched_by_source_update_list:
            delta_merge_builder = delta_merge_builder.whenNotMatchedBySourceUpdate(
                condition=when_not_matched_by_source_update.condition,
                set=when_not_matched_by_source_update.values,
            )

        for (
            when_not_matched_by_source_delete
        ) in self.when_not_matched_by_source_delete_list:
            delta_merge_builder = delta_merge_builder.whenNotMatchedBySourceDelete(
                condition=when_not_matched_by_source_delete.condition,
            )

        return delta_merge_builder

    def _stream_merge_micro_batch(
        self, micro_batch_df: DataFrame, epoch_id=None
    ):  # NOSONAR
        micro_batch_df.persist()

        retry_delta_merge = False

        if self.try_broadcast_join == True:
            try:
                delta_merge = self._delta_merge_builder(
                    micro_batch_df, self.try_broadcast_join
                )
                delta_merge.execute()
            except Exception as e:
                if "SparkOutOfMemoryError" in str(e):
                    retry_delta_merge = True
                else:
                    raise e

        if self.try_broadcast_join == False or retry_delta_merge == True:
            delta_merge = self._delta_merge_builder(micro_batch_df, False)
            delta_merge.execute()

        micro_batch_df.unpersist()

    def write_batch(self):
        """
        Merges batch data into a Delta Table.
        """
        try:
            delta_merge = self._delta_merge_builder(self.data, self.try_broadcast_join)
            return delta_merge.execute()

        except Py4JJavaError as e:
            logging.exception(e.errmsg)
            raise e
        except Exception as e:
            logging.exception(str(e))
            raise e

    def write_stream(self):
        """
        Merges streaming data to Delta using foreachBatch
        """
        TRIGGER_OPTION = (
            {"availableNow": True}
            if self.trigger == "availableNow"
            else {"processingTime": self.trigger}
        )
        try:
            query = (
                self.data.writeStream.trigger(**TRIGGER_OPTION)
                .format("delta")
                .foreachBatch(self._stream_merge_micro_batch)
                .queryName(self.query_name)
                .outputMode("update")
                .options(**self.options)
                .start()
            )

            if self.query_wait_interval:
                while query.isActive:
                    if query.lastProgress:
                        logging.info(query.lastProgress)
                    time.sleep(self.query_wait_interval)

        except Py4JJavaError as e:
            logging.exception(e.errmsg)
            raise e
        except Exception as e:
            logging.exception(str(e))
            raise e

system_type() staticmethod

Attributes:

| Name | Type | Description |
|------|------|-------------|
| SystemType | Environment | Requires PYSPARK |

Source code in src/sdk/python/rtdip_sdk/pipelines/destinations/spark/delta_merge.py
@staticmethod
def system_type():
    """
    Attributes:
        SystemType (Environment): Requires PYSPARK
    """
    return SystemType.PYSPARK

write_batch()

Merges batch data into a Delta Table.

Source code in src/sdk/python/rtdip_sdk/pipelines/destinations/spark/delta_merge.py
def write_batch(self):
    """
    Merges batch data into a Delta Table.
    """
    try:
        delta_merge = self._delta_merge_builder(self.data, self.try_broadcast_join)
        return delta_merge.execute()

    except Py4JJavaError as e:
        logging.exception(e.errmsg)
        raise e
    except Exception as e:
        logging.exception(str(e))
        raise e

write_stream()

Merges streaming data to Delta using foreachBatch

Source code in src/sdk/python/rtdip_sdk/pipelines/destinations/spark/delta_merge.py
def write_stream(self):
    """
    Merges streaming data to Delta using foreachBatch
    """
    TRIGGER_OPTION = (
        {"availableNow": True}
        if self.trigger == "availableNow"
        else {"processingTime": self.trigger}
    )
    try:
        query = (
            self.data.writeStream.trigger(**TRIGGER_OPTION)
            .format("delta")
            .foreachBatch(self._stream_merge_micro_batch)
            .queryName(self.query_name)
            .outputMode("update")
            .options(**self.options)
            .start()
        )

        if self.query_wait_interval:
            while query.isActive:
                if query.lastProgress:
                    logging.info(query.lastProgress)
                time.sleep(self.query_wait_interval)

    except Py4JJavaError as e:
        logging.exception(e.errmsg)
        raise e
    except Exception as e:
        logging.exception(str(e))
        raise e

SparkEventhubDestination

Bases: DestinationInterface

This Spark destination class is used to write batch or streaming data to Eventhubs. Eventhub configurations need to be specified as options in a dictionary. Additionally, there are more optional configurations which can be found here. If using startingPosition or endingPosition make sure to check out Event Position section for more details and examples.

Examples

#Eventhub Destination for Streaming Queries

from rtdip_sdk.pipelines.destinations import SparkEventhubDestination
from rtdip_sdk.pipelines.utilities import SparkSessionUtility

# Not required if using Databricks
spark = SparkSessionUtility(config={}).execute()

connectionString = "Endpoint=sb://{NAMESPACE}.servicebus.windows.net/;SharedAccessKeyName={ACCESS_KEY_NAME};SharedAccessKey={ACCESS_KEY}=;EntityPath={EVENT_HUB_NAME}"

eventhub_destination = SparkEventhubDestination(
    spark=spark,
    data=df,
    options={
        "eventhubs.connectionString": connectionString,
        "eventhubs.consumerGroup": "{YOUR-EVENTHUB-CONSUMER-GROUP}",
        "checkpointLocation": "/{CHECKPOINT-LOCATION}/"
    },
    trigger="10 seconds",
    query_name="EventhubDestination",
    query_wait_interval=None
)

eventhub_destination.write_stream()
#Eventhub Destination for Batch Queries

from rtdip_sdk.pipelines.destinations import SparkEventhubDestination
from rtdip_sdk.pipelines.utilities import SparkSessionUtility

# Not required if using Databricks
spark = SparkSessionUtility(config={}).execute()

connectionString = "Endpoint=sb://{NAMESPACE}.servicebus.windows.net/;SharedAccessKeyName={ACCESS_KEY_NAME};SharedAccessKey={ACCESS_KEY}=;EntityPath={EVENT_HUB_NAME}"


eventhub_destination = SparkEventhubDestination(
    spark=spark,
    data=df,
    options={
        "eventhubs.connectionString": connectionString,
        "eventhubs.consumerGroup": "{YOUR-EVENTHUB-CONSUMER-GROUP}"
    },
    trigger="10 seconds",
    query_name="EventhubDestination",
    query_wait_interval=None
)

eventhub_destination.write_batch()

Parameters:

| Name | Type | Description | Default |
|------|------|-------------|---------|
| spark | SparkSession | Spark Session | required |
| data | DataFrame | Dataframe to be written to Eventhub | required |
| options | dict | A dictionary of Eventhub configurations (See Attributes table below). All configuration options for Eventhubs can be found here. | required |
| trigger | optional str | Frequency of the write operation. Specify "availableNow" to execute a trigger once, otherwise specify a time period such as "30 seconds" or "5 minutes". Set to "0 seconds" if you do not want to use a trigger. (stream) | '10 seconds' |
| query_name | str | Unique name for the query in the associated SparkSession | 'EventhubDestination' |
| query_wait_interval | optional int | If set, waits for the streaming query to complete before returning. (stream) | None |

Attributes:

| Name | Type | Description |
|------|------|-------------|
| checkpointLocation | str | Path to checkpoint files. (Streaming) |
| eventhubs.connectionString | str | Eventhubs connection string, required to connect to the Eventhubs service. (Streaming and Batch) |
| eventhubs.consumerGroup | str | A consumer group is a view of an entire eventhub. Consumer groups enable multiple consuming applications to each have a separate view of the event stream, and to read the stream independently at their own pace and with their own offsets. (Streaming and Batch) |
| eventhubs.startingPosition | JSON str | The starting position for your Structured Streaming job. If a specific EventPosition is not set for a partition using startingPositions, then the EventPosition set in startingPosition is used. If nothing is set in either option, consumption begins from the end of the partition. (Streaming and Batch) |
| eventhubs.endingPosition | JSON str | The ending position of a batch query. This works the same as startingPosition. (Batch) |
| maxEventsPerTrigger | long | Rate limit on the maximum number of events processed per trigger interval. The specified total number of events will be proportionally split across partitions of different volume. (Stream) |
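When no `body` column is present, the destination serializes every column except `partitionId` and `partitionKey` into a JSON `body` column before writing (see `prepare_columns` in the source below). The following standalone snippet reproduces that transformation so the resulting payload can be inspected; it assumes an existing SparkSession `spark`, and `TagName`/`Value` are example column names only.

```python
# Standalone illustration of the payload preparation performed by prepare_columns
# in the source below; TagName and Value are example columns, not part of the API.
from pyspark.sql.functions import col, struct, to_json

df = spark.createDataFrame([("Sensor1", 1.0), ("Sensor2", 2.0)], ["TagName", "Value"])

payload_df = df.withColumn(
    "body",
    to_json(
        struct(
            [col(c) for c in df.columns if c not in ["partitionId", "partitionKey"]]
        )
    ),
).select("body")

payload_df.show(truncate=False)  # each row becomes e.g. {"TagName":"Sensor1","Value":1.0}
```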

Source code in src/sdk/python/rtdip_sdk/pipelines/destinations/spark/eventhub.py
class SparkEventhubDestination(DestinationInterface):
    """
    This Spark destination class is used to write batch or streaming data to Eventhubs. Eventhub configurations need to be specified as options in a dictionary.
    Additionally, there are more optional configurations which can be found [here.](https://github.com/Azure/azure-event-hubs-spark/blob/master/docs/PySpark/structured-streaming-pyspark.md#event-hubs-configuration){ target="_blank" }
    If using startingPosition or endingPosition make sure to check out **Event Position** section for more details and examples.

    Examples
    --------
    ```python
    #Eventhub Destination for Streaming Queries

    from rtdip_sdk.pipelines.destinations import SparkEventhubDestination
    from rtdip_sdk.pipelines.utilities import SparkSessionUtility

    # Not required if using Databricks
    spark = SparkSessionUtility(config={}).execute()

    connectionString = "Endpoint=sb://{NAMESPACE}.servicebus.windows.net/;SharedAccessKeyName={ACCESS_KEY_NAME};SharedAccessKey={ACCESS_KEY}=;EntityPath={EVENT_HUB_NAME}"

    eventhub_destination = SparkEventhubDestination(
        spark=spark,
        data=df,
        options={
            "eventhubs.connectionString": connectionString,
            "eventhubs.consumerGroup": "{YOUR-EVENTHUB-CONSUMER-GROUP}",
            "checkpointLocation": "/{CHECKPOINT-LOCATION}/"
        },
        trigger="10 seconds",
        query_name="EventhubDestination",
        query_wait_interval=None
    )

    eventhub_destination.write_stream()
    ```
    ```python
    #Eventhub Destination for Batch Queries

    from rtdip_sdk.pipelines.destinations import SparkEventhubDestination
    from rtdip_sdk.pipelines.utilities import SparkSessionUtility

    # Not required if using Databricks
    spark = SparkSessionUtility(config={}).execute()

    connectionString = "Endpoint=sb://{NAMESPACE}.servicebus.windows.net/;SharedAccessKeyName={ACCESS_KEY_NAME};SharedAccessKey={ACCESS_KEY}=;EntityPath={EVENT_HUB_NAME}"


    eventhub_destination = SparkEventhubDestination(
        spark=spark,
        data=df,
        options={
            "eventhubs.connectionString": connectionString,
            "eventhubs.consumerGroup": "{YOUR-EVENTHUB-CONSUMER-GROUP}"
        },
        trigger="10 seconds",
        query_name="EventhubDestination",
        query_wait_interval=None
    )

    eventhub_destination.write_batch()
    ```

    Parameters:
        spark (SparkSession): Spark Session
        data (DataFrame): Dataframe to be written to Eventhub
        options (dict): A dictionary of Eventhub configurations (See Attributes table below). All Configuration options for Eventhubs can be found [here.](https://github.com/Azure/azure-event-hubs-spark/blob/master/docs/PySpark/structured-streaming-pyspark.md#event-hubs-configuration){ target="_blank" }
        trigger (optional str): Frequency of the write operation. Specify "availableNow" to execute a trigger once, otherwise specify a time period such as "30 seconds", "5 minutes". Set to "0 seconds" if you do not want to use a trigger. (stream) Default is 10 seconds
        query_name (str): Unique name for the query in associated SparkSession
        query_wait_interval (optional int): If set, waits for the streaming query to complete before returning. (stream) Default is None

    Attributes:
        checkpointLocation (str): Path to checkpoint files. (Streaming)
        eventhubs.connectionString (str):  Eventhubs connection string is required to connect to the Eventhubs service. (Streaming and Batch)
        eventhubs.consumerGroup (str): A consumer group is a view of an entire eventhub. Consumer groups enable multiple consuming applications to each have a separate view of the event stream, and to read the stream independently at their own pace and with their own offsets. (Streaming and Batch)
        eventhubs.startingPosition (JSON str): The starting position for your Structured Streaming job. If a specific EventPosition is not set for a partition using startingPositions, then we use the EventPosition set in startingPosition. If nothing is set in either option, we will begin consuming from the end of the partition. (Streaming and Batch)
        eventhubs.endingPosition: (JSON str): The ending position of a batch query. This works the same as startingPosition. (Batch)
        maxEventsPerTrigger (long): Rate limit on maximum number of events processed per trigger interval. The specified total number of events will be proportionally split across partitions of different volume. (Stream)
    """

    spark: SparkSession
    data: DataFrame
    options: dict
    trigger: str
    query_name: str
    query_wait_interval: int

    def __init__(
        self,
        spark: SparkSession,
        data: DataFrame,
        options: dict,
        trigger="10 seconds",
        query_name="EventhubDestination",
        query_wait_interval: int = None,
    ) -> None:
        self.spark = spark
        self.data = data
        self.options = options
        self.trigger = trigger
        self.query_name = query_name
        self.query_wait_interval = query_wait_interval

    @staticmethod
    def system_type():
        """
        Attributes:
            SystemType (Environment): Requires PYSPARK
        """
        return SystemType.PYSPARK

    @staticmethod
    def libraries():
        spark_libraries = Libraries()
        spark_libraries.add_maven_library(get_default_package("spark_azure_eventhub"))
        return spark_libraries

    @staticmethod
    def settings() -> dict:
        return {}

    def pre_write_validation(self):
        return True

    def post_write_validation(self):
        return True

    def prepare_columns(self):
        if "body" in self.data.columns:
            if self.data.schema["body"].dataType not in [StringType(), BinaryType()]:
                try:
                    self.data.withColumn("body", col("body").cast(StringType()))
                except Exception as e:
                    raise ValueError(
                        "'body' column must be of string or binary type", e
                    )
        else:
            self.data = self.data.withColumn(
                "body",
                to_json(
                    struct(
                        [
                            col(column).alias(column)
                            for column in self.data.columns
                            if column not in ["partitionId", "partitionKey"]
                        ]
                    )
                ),
            )
        for column in self.data.schema:
            if (
                column.name in ["partitionId", "partitionKey"]
                and column.dataType != StringType()
            ):
                try:
                    self.data = self.data.withColumn(
                        column.name, col(column.name).cast(StringType())
                    )
                except Exception as e:
                    raise ValueError(f"Column {column.name} must be of string type", e)
        return self.data.select(
            [
                column
                for column in self.data.columns
                if column in ["partitionId", "partitionKey", "body"]
            ]
        )

    def write_batch(self):
        """
        Writes batch data to Eventhubs.
        """
        eventhub_connection_string = "eventhubs.connectionString"
        try:
            if eventhub_connection_string in self.options:
                sc = self.spark.sparkContext
                self.options[eventhub_connection_string] = (
                    sc._jvm.org.apache.spark.eventhubs.EventHubsUtils.encrypt(
                        self.options[eventhub_connection_string]
                    )
                )
            df = self.prepare_columns()
            return df.write.format("eventhubs").options(**self.options).save()

        except Py4JJavaError as e:
            logging.exception(e.errmsg)
            raise e
        except Exception as e:
            logging.exception(str(e))
            raise e

    def write_stream(self):
        """
        Writes streaming data to Eventhubs.
        """
        eventhub_connection_string = "eventhubs.connectionString"
        try:
            TRIGGER_OPTION = (
                {"availableNow": True}
                if self.trigger == "availableNow"
                else {"processingTime": self.trigger}
            )
            if eventhub_connection_string in self.options:
                sc = self.spark.sparkContext
                self.options[eventhub_connection_string] = (
                    sc._jvm.org.apache.spark.eventhubs.EventHubsUtils.encrypt(
                        self.options[eventhub_connection_string]
                    )
                )
            df = self.prepare_columns()
            df = self.data.select(
                [
                    column
                    for column in self.data.columns
                    if column in ["partitionId", "partitionKey", "body"]
                ]
            )
            query = (
                df.writeStream.trigger(**TRIGGER_OPTION)
                .format("eventhubs")
                .options(**self.options)
                .queryName(self.query_name)
                .start()
            )

            if self.query_wait_interval:
                while query.isActive:
                    if query.lastProgress:
                        logging.info(query.lastProgress)
                    time.sleep(self.query_wait_interval)

        except Py4JJavaError as e:
            logging.exception(e.errmsg)
            raise e
        except Exception as e:
            logging.exception(str(e))
            raise e

system_type() staticmethod

Attributes:

| Name | Type | Description |
|------|------|-------------|
| SystemType | Environment | Requires PYSPARK |

Source code in src/sdk/python/rtdip_sdk/pipelines/destinations/spark/eventhub.py
@staticmethod
def system_type():
    """
    Attributes:
        SystemType (Environment): Requires PYSPARK
    """
    return SystemType.PYSPARK

write_batch()

Writes batch data to Eventhubs.

Source code in src/sdk/python/rtdip_sdk/pipelines/destinations/spark/eventhub.py
def write_batch(self):
    """
    Writes batch data to Eventhubs.
    """
    eventhub_connection_string = "eventhubs.connectionString"
    try:
        if eventhub_connection_string in self.options:
            sc = self.spark.sparkContext
            self.options[eventhub_connection_string] = (
                sc._jvm.org.apache.spark.eventhubs.EventHubsUtils.encrypt(
                    self.options[eventhub_connection_string]
                )
            )
        df = self.prepare_columns()
        return df.write.format("eventhubs").options(**self.options).save()

    except Py4JJavaError as e:
        logging.exception(e.errmsg)
        raise e
    except Exception as e:
        logging.exception(str(e))
        raise e

write_stream()

Writes streaming data to Eventhubs.

Source code in src/sdk/python/rtdip_sdk/pipelines/destinations/spark/eventhub.py
def write_stream(self):
    """
    Writes streaming data to Eventhubs.
    """
    eventhub_connection_string = "eventhubs.connectionString"
    try:
        TRIGGER_OPTION = (
            {"availableNow": True}
            if self.trigger == "availableNow"
            else {"processingTime": self.trigger}
        )
        if eventhub_connection_string in self.options:
            sc = self.spark.sparkContext
            self.options[eventhub_connection_string] = (
                sc._jvm.org.apache.spark.eventhubs.EventHubsUtils.encrypt(
                    self.options[eventhub_connection_string]
                )
            )
        df = self.prepare_columns()
        df = self.data.select(
            [
                column
                for column in self.data.columns
                if column in ["partitionId", "partitionKey", "body"]
            ]
        )
        query = (
            df.writeStream.trigger(**TRIGGER_OPTION)
            .format("eventhubs")
            .options(**self.options)
            .queryName(self.query_name)
            .start()
        )

        if self.query_wait_interval:
            while query.isActive:
                if query.lastProgress:
                    logging.info(query.lastProgress)
                time.sleep(self.query_wait_interval)

    except Py4JJavaError as e:
        logging.exception(e.errmsg)
        raise e
    except Exception as e:
        logging.exception(str(e))
        raise e

SparkKafkaDestination

Bases: DestinationInterface

This Spark destination class is used to write batch or streaming data to Kafka. Required and optional configurations can be found in the Attributes tables below.

Additionally, there are more optional configurations which can be found here.

For compatibility between Spark and Kafka, the columns in the input dataframe are concatenated into one 'value' column of JSON string.

Example

from rtdip_sdk.pipelines.destinations import SparkKafkaDestination

kafka_destination = SparkKafkaDestination(
    data=df,
    options={
        "kafka.bootstrap.servers": "host1:port1,host2:port2"
    },
    trigger="10 seconds",
    query_name="KafkaDestination",
    query_wait_interval=None
)

kafka_destination.write_stream()

OR

kafka_destination.write_batch()

Parameters:

| Name | Type | Description | Default |
|------|------|-------------|---------|
| data | DataFrame | Dataframe to be written to Kafka | required |
| options | dict | A dictionary of Kafka configurations (See Attributes tables below). For more information on configuration options see here | required |
| trigger | optional str | Frequency of the write operation. Specify "availableNow" to execute a trigger once, otherwise specify a time period such as "30 seconds" or "5 minutes". Set to "0 seconds" if you do not want to use a trigger. (stream) | '10 seconds' |
| query_name | str | Unique name for the query in the associated SparkSession | 'KafkaDestination' |
| query_wait_interval | optional int | If set, waits for the streaming query to complete before returning. (stream) | None |

The following options must be set for the Kafka destination for both batch and streaming queries.

Attributes:

| Name | Type | Description |
|------|------|-------------|
| kafka.bootstrap.servers | A comma-separated list of host:port | The Kafka "bootstrap.servers" configuration. (Streaming and Batch) |

The following configurations are optional:

Attributes:

| Name | Type | Description |
|------|------|-------------|
| topic | str | Sets the topic that all rows will be written to in Kafka. This option overrides any topic column that may exist in the data. (Streaming and Batch) |
| includeHeaders | bool | Whether to include the Kafka headers in the row. (Streaming and Batch) |
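As noted above, the destination concatenates all columns of the input dataframe into a single JSON `value` column before writing to Kafka, mirroring the `select` in `write_batch` and `write_stream` below. The snippet that follows shows the transformation in isolation; it assumes an existing SparkSession `spark`, and the broker list and topic are placeholders.

```python
# Standalone illustration of the 'value' column written to Kafka; TagName/Value
# are example columns and the broker/topic values are placeholders.
from pyspark.sql.functions import struct, to_json

df = spark.createDataFrame([("Sensor1", 1.0)], ["TagName", "Value"])

kafka_ready_df = df.select(to_json(struct("*")).alias("value"))
kafka_ready_df.show(truncate=False)  # value = {"TagName":"Sensor1","Value":1.0}

options = {
    "kafka.bootstrap.servers": "host1:port1,host2:port2",
    "topic": "{YOUR-TOPIC}",  # optional; overrides any topic column in the data
}
```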

Source code in src/sdk/python/rtdip_sdk/pipelines/destinations/spark/kafka.py
class SparkKafkaDestination(DestinationInterface):
    """
    This Spark destination class is used to write batch or streaming data to Kafka. Required and optional configurations can be found in the Attributes tables below.

    Additionally, there are more optional configurations which can be found [here.](https://spark.apache.org/docs/latest/structured-streaming-kafka-integration.html){ target="_blank" }

    For compatibility between Spark and Kafka, the columns in the input dataframe are concatenated into one 'value' column of JSON string.

    Example
    --------
    ```python
    from rtdip_sdk.pipelines.destinations import SparkKafkaDestination

    kafka_destination = SparkKafkaDestination(
        data=df,
        options={
            "kafka.bootstrap.servers": "host1:port1,host2:port2"
        },
        trigger="10 seconds",
        query_name="KafkaDestination",
        query_wait_interval=None
    )

    kafka_destination.write_stream()

    OR

    kafka_destination.write_batch()
    ```

    Parameters:
        data (DataFrame): Dataframe to be written to Kafka
        options (dict): A dictionary of Kafka configurations (See Attributes tables below). For more information on configuration options see [here](https://spark.apache.org/docs/latest/structured-streaming-kafka-integration.html){ target="_blank" }
        trigger (optional str): Frequency of the write operation. Specify "availableNow" to execute a trigger once, otherwise specify a time period such as "30 seconds", "5 minutes". Set to "0 seconds" if you do not want to use a trigger. (stream) Default is 10 seconds
        query_name (str): Unique name for the query in associated SparkSession
        query_wait_interval (optional int): If set, waits for the streaming query to complete before returning. (stream) Default is None

    The following options must be set for the Kafka destination for both batch and streaming queries.

    Attributes:
        kafka.bootstrap.servers (A comma-separated list of host︰port): The Kafka "bootstrap.servers" configuration. (Streaming and Batch)

    The following configurations are optional:

    Attributes:
        topic (str):Sets the topic that all rows will be written to in Kafka. This option overrides any topic column that may exist in the data. (Streaming and Batch)
        includeHeaders (bool): Whether to include the Kafka headers in the row. (Streaming and Batch)

    """

    data: DataFrame
    options: dict
    trigger: str
    query_name: str
    query_wait_interval: int

    def __init__(
        self,
        data: DataFrame,
        options: dict,
        trigger="10 seconds",
        query_name="KafkaDestination",
        query_wait_interval: int = None,
    ) -> None:
        self.data = data
        self.options = options
        self.trigger = trigger
        self.query_name = query_name
        self.query_wait_interval = query_wait_interval

    @staticmethod
    def system_type():
        """
        Attributes:
            SystemType (Environment): Requires PYSPARK
        """
        return SystemType.PYSPARK

    @staticmethod
    def libraries():
        spark_libraries = Libraries()
        spark_libraries.add_maven_library(get_default_package("spark_sql_kafka"))
        return spark_libraries

    @staticmethod
    def settings() -> dict:
        return {}

    def pre_write_validation(self):
        return True

    def post_write_validation(self):
        return True

    def write_batch(self):
        """
        Writes batch data to Kafka.
        """
        try:
            return (
                self.data.select(to_json(struct("*")).alias("value"))
                .write.format("kafka")
                .options(**self.options)
                .save()
            )

        except Py4JJavaError as e:
            logging.exception(e.errmsg)
            raise e
        except Exception as e:
            logging.exception(str(e))
            raise e

    def write_stream(self):
        """
        Writes streaming data to Kafka.
        """
        try:
            TRIGGER_OPTION = (
                {"availableNow": True}
                if self.trigger == "availableNow"
                else {"processingTime": self.trigger}
            )
            query = (
                self.data.select(to_json(struct("*")).alias("value"))
                .writeStream.trigger(**TRIGGER_OPTION)
                .format("kafka")
                .options(**self.options)
                .queryName(self.query_name)
                .start()
            )

            if self.query_wait_interval:
                while query.isActive:
                    if query.lastProgress:
                        logging.info(query.lastProgress)
                    time.sleep(self.query_wait_interval)

        except Py4JJavaError as e:
            logging.exception(e.errmsg)
            raise e
        except Exception as e:
            logging.exception(str(e))
            raise e

system_type() staticmethod

Attributes:

| Name | Type | Description |
|------|------|-------------|
| SystemType | Environment | Requires PYSPARK |

Source code in src/sdk/python/rtdip_sdk/pipelines/destinations/spark/kafka.py
@staticmethod
def system_type():
    """
    Attributes:
        SystemType (Environment): Requires PYSPARK
    """
    return SystemType.PYSPARK

write_batch()

Writes batch data to Kafka.

Source code in src/sdk/python/rtdip_sdk/pipelines/destinations/spark/kafka.py
def write_batch(self):
    """
    Writes batch data to Kafka.
    """
    try:
        return (
            self.data.select(to_json(struct("*")).alias("value"))
            .write.format("kafka")
            .options(**self.options)
            .save()
        )

    except Py4JJavaError as e:
        logging.exception(e.errmsg)
        raise e
    except Exception as e:
        logging.exception(str(e))
        raise e

write_stream()

Writes streaming data to Kafka.

Source code in src/sdk/python/rtdip_sdk/pipelines/destinations/spark/kafka.py
def write_stream(self):
    """
    Writes streaming data to Kafka.
    """
    try:
        TRIGGER_OPTION = (
            {"availableNow": True}
            if self.trigger == "availableNow"
            else {"processingTime": self.trigger}
        )
        query = (
            self.data.select(to_json(struct("*")).alias("value"))
            .writeStream.trigger(**TRIGGER_OPTION)
            .format("kafka")
            .options(**self.options)
            .queryName(self.query_name)
            .start()
        )

        if self.query_wait_interval:
            while query.isActive:
                if query.lastProgress:
                    logging.info(query.lastProgress)
                time.sleep(self.query_wait_interval)

    except Py4JJavaError as e:
        logging.exception(e.errmsg)
        raise e
    except Exception as e:
        logging.exception(str(e))
        raise e

SparkKinesisDestination

Bases: DestinationInterface

This Kinesis destination class is used to write batch or streaming data to Kinesis. Kinesis configurations need to be specified as options in a dictionary.

Example

from rtdip_sdk.pipelines.destinations import SparkKinesisDestination

kinesis_destination = SparkKinesisDestination(
    data=df,
    options={
        "endpointUrl": "https://kinesis.{REGION}.amazonaws.com",
        "awsAccessKey": "{YOUR-AWS-ACCESS-KEY}",
        "awsSecretKey": "{YOUR-AWS-SECRET-KEY}",
        "streamName": "{YOUR-STREAM-NAME}"
    },
    mode="update",
    trigger="10 seconds",
    query_name="KinesisDestination",
    query_wait_interval=None
)

kinesis_destination.write_stream()

OR

kinesis_destination.write_batch()

Parameters:

| Name | Type | Description | Default |
|------|------|-------------|---------|
| data | DataFrame | Dataframe to be written to Kinesis | required |
| options | dict | A dictionary of Kinesis configurations (See Attributes table below). All configuration options for Kinesis can be found here. | required |
| mode | str | Method of writing to Kinesis - append, complete or update | 'update' |
| trigger | optional str | Frequency of the write operation. Specify "availableNow" to execute a trigger once, otherwise specify a time period such as "30 seconds" or "5 minutes". Set to "0 seconds" if you do not want to use a trigger. (stream) | '10 seconds' |
| query_name | str | Unique name for the query in the associated SparkSession | 'KinesisDestination' |
| query_wait_interval | optional int | If set, waits for the streaming query to complete before returning. (stream) | None |

Attributes:

| Name | Type | Description |
|------|------|-------------|
| endpointUrl | str | Endpoint of the Kinesis stream. |
| awsAccessKey | str | AWS access key. |
| awsSecretKey | str | AWS secret access key corresponding to the access key. |
| streamName | List[str] | Name of the streams in Kinesis to write to. |
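Because `write_stream` maps `trigger="availableNow"` to a one-shot trigger and `query_wait_interval` makes the call block while the query is active (see the source below), the two can be combined to drain all currently available data and return once the query stops. A usage sketch, assuming a DataFrame `df` already exists and all `{PLACEHOLDER}` values are replaced:

```python
# Usage sketch: one-shot streaming write that blocks until the query finishes.
# Requires Databricks (SystemType PYSPARK_DATABRICKS); credentials are placeholders.
from rtdip_sdk.pipelines.destinations import SparkKinesisDestination

kinesis_destination = SparkKinesisDestination(
    data=df,
    options={
        "endpointUrl": "https://kinesis.{REGION}.amazonaws.com",
        "awsAccessKey": "{YOUR-AWS-ACCESS-KEY}",
        "awsSecretKey": "{YOUR-AWS-SECRET-KEY}",
        "streamName": "{YOUR-STREAM-NAME}",
    },
    mode="update",
    trigger="availableNow",   # becomes trigger(availableNow=True) in write_stream
    query_name="KinesisDestination",
    query_wait_interval=10,   # poll every 10 seconds until the query stops
)

kinesis_destination.write_stream()
```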

Source code in src/sdk/python/rtdip_sdk/pipelines/destinations/spark/kinesis.py
class SparkKinesisDestination(DestinationInterface):
    """
    This Kinesis destination class is used to write batch or streaming data to Kinesis. Kinesis configurations need to be specified as options in a dictionary.

    Example
    --------
    ```python
    from rtdip_sdk.pipelines.destinations import SparkKinesisDestination

    kinesis_destination = SparkKinesisDestination(
        data=df,
        options={
            "endpointUrl": "https://kinesis.{REGION}.amazonaws.com",
            "awsAccessKey": "{YOUR-AWS-ACCESS-KEY}",
            "awsSecretKey": "{YOUR-AWS-SECRET-KEY}",
            "streamName": "{YOUR-STREAM-NAME}"
        },
        mode="update",
        trigger="10 seconds",
        query_name="KinesisDestination",
        query_wait_interval=None
    )

    kinesis_destination.write_stream()

    OR

    kinesis_destination.write_batch()
    ```

    Parameters:
        data (DataFrame): Dataframe to be written to Kinesis
        options (dict): A dictionary of Kinesis configurations (See Attributes table below). All Configuration options for Kinesis can be found [here.](https://github.com/qubole/kinesis-sql#kinesis-sink-configuration){ target="_blank" }
        mode (str): Method of writing to Kinesis - append, complete, update
        trigger (optional str): Frequency of the write operation. Specify "availableNow" to execute a trigger once, otherwise specify a time period such as "30 seconds", "5 minutes". Set to "0 seconds" if you do not want to use a trigger. (stream) Default is 10 seconds
        query_name (str): Unique name for the query in associated SparkSession
        query_wait_interval (optional int): If set, waits for the streaming query to complete before returning. (stream) Default is None

    Attributes:
        endpointUrl (str): Endpoint of the kinesis stream.
        awsAccessKey (str): AWS access key.
        awsSecretKey (str): AWS secret access key corresponding to the access key.
        streamName (List[str]): Name of the streams in Kinesis to write to.
    """

    data: DataFrame
    options: dict
    mode: str
    trigger: str
    query_name: str
    query_wait_interval: int

    def __init__(
        self,
        data: DataFrame,
        options: dict,
        mode: str = "update",
        trigger: str = "10 seconds",
        query_name="KinesisDestination",
        query_wait_interval: int = None,
    ) -> None:
        self.data = data
        self.options = options
        self.mode = mode
        self.trigger = trigger
        self.query_name = query_name
        self.query_wait_interval = query_wait_interval

    @staticmethod
    def system_type():
        """
        Attributes:
            SystemType (Environment): Requires PYSPARK_DATABRICKS
        """
        return SystemType.PYSPARK_DATABRICKS

    @staticmethod
    def libraries():
        spark_libraries = Libraries()
        return spark_libraries

    @staticmethod
    def settings() -> dict:
        return {}

    def pre_write_validation(self):
        return True

    def post_write_validation(self):
        return True

    def write_batch(self):
        """
        Writes batch data to Kinesis.
        """
        try:
            return self.data.write.format("kinesis").options(**self.options).save()
        except Py4JJavaError as e:
            logging.exception(e.errmsg)
            raise e
        except Exception as e:
            logging.exception(str(e))
            raise e

    def write_stream(self):
        """
        Writes streaming data to Kinesis.
        """
        try:
            TRIGGER_OPTION = (
                {"availableNow": True}
                if self.trigger == "availableNow"
                else {"processingTime": self.trigger}
            )
            query = (
                self.data.writeStream.trigger(**TRIGGER_OPTION)
                .format("kinesis")
                .outputMode(self.mode)
                .options(**self.options)
                .queryName(self.query_name)
                .start()
            )

            if self.query_wait_interval:
                while query.isActive:
                    if query.lastProgress:
                        logging.info(query.lastProgress)
                    time.sleep(self.query_wait_interval)

        except Py4JJavaError as e:
            logging.exception(e.errmsg)
            raise e
        except Exception as e:
            logging.exception(str(e))
            raise e

system_type() staticmethod

Attributes:

| Name | Type | Description |
|------|------|-------------|
| SystemType | Environment | Requires PYSPARK_DATABRICKS |

Source code in src/sdk/python/rtdip_sdk/pipelines/destinations/spark/kinesis.py
@staticmethod
def system_type():
    """
    Attributes:
        SystemType (Environment): Requires PYSPARK_DATABRICKS
    """
    return SystemType.PYSPARK_DATABRICKS

write_batch()

Writes batch data to Kinesis.

Source code in src/sdk/python/rtdip_sdk/pipelines/destinations/spark/kinesis.py
def write_batch(self):
    """
    Writes batch data to Kinesis.
    """
    try:
        return self.data.write.format("kinesis").options(**self.options).save()
    except Py4JJavaError as e:
        logging.exception(e.errmsg)
        raise e
    except Exception as e:
        logging.exception(str(e))
        raise e

write_stream()

Writes streaming data to Kinesis.

Source code in src/sdk/python/rtdip_sdk/pipelines/destinations/spark/kinesis.py
def write_stream(self):
    """
    Writes streaming data to Kinesis.
    """
    try:
        TRIGGER_OPTION = (
            {"availableNow": True}
            if self.trigger == "availableNow"
            else {"processingTime": self.trigger}
        )
        query = (
            self.data.writeStream.trigger(**TRIGGER_OPTION)
            .format("kinesis")
            .outputMode(self.mode)
            .options(**self.options)
            .queryName(self.query_name)
            .start()
        )

        if self.query_wait_interval:
            while query.isActive:
                if query.lastProgress:
                    logging.info(query.lastProgress)
                time.sleep(self.query_wait_interval)

    except Py4JJavaError as e:
        logging.exception(e.errmsg)
        raise e
    except Exception as e:
        logging.exception(str(e))
        raise e

SparkRestAPIDestination

Bases: DestinationInterface

The Spark Rest API Destination is used to write data to a Rest API.

The payload sent to the API is constructed by converting each row in the DataFrame to Json.

Note

While it is possible to use the write_batch method, it is easy to overwhelm a Rest API with large volumes of data. Consider reducing data volumes when writing to a Rest API in batch mode to prevent API errors, including throttling.

Example

#Rest API Destination for Streaming Queries

from rtdip_sdk.pipelines.destinations import SparkRestAPIDestination

rest_api_destination = SparkRestAPIDestination(
    data=df,
    options={
        "checkpointLocation": "{/CHECKPOINT-LOCATION/}"
    },
    url="{REST-API-URL}",
    headers = {
        'Authorization': 'Bearer {}'.format("{TOKEN}")
    },
    batch_size=100,
    method="POST",
    parallelism=8,
    trigger="1 minute",
    query_name="DeltaRestAPIDestination",
    query_wait_interval=None
)

rest_api_destination.write_stream()
#Rest API Destination for Batch Queries

from rtdip_sdk.pipelines.destinations import SparkRestAPIDestination

rest_api_destination = SparkRestAPIDestination(
    data=df,
    options={},
    url="{REST-API-URL}",
    headers = {
        'Authorization': 'Bearer {}'.format("{TOKEN}")
    },
    batch_size=10,
    method="POST",
    parallelism=4,
    trigger="1 minute",
    query_name="DeltaRestAPIDestination",
    query_wait_interval=None
)

rest_api_destination.write_batch()

Parameters:

| Name | Type | Description | Default |
|------|------|-------------|---------|
| data | DataFrame | Dataframe to be written to the Rest API | required |
| options | dict | A dictionary of options for streaming writes | required |
| url | str | The Rest API Url | required |
| headers | dict | A dictionary of headers to be provided to the Rest API | required |
| batch_size | int | The number of DataFrame rows to be used in each Rest API call | required |
| method | str | The method to be used when calling the Rest API. Allowed values are POST, PATCH and PUT | 'POST' |
| parallelism | int | The number of concurrent calls to be made to the Rest API | 8 |
| trigger | optional str | Frequency of the write operation. Specify "availableNow" to execute a trigger once, otherwise specify a time period such as "30 seconds" or "5 minutes". Set to "0 seconds" if you do not want to use a trigger. (stream) | '1 minutes' |
| query_name | str | Unique name for the query in the associated SparkSession | 'DeltaRestAPIDestination' |
| query_wait_interval | optional int | If set, waits for the streaming query to complete before returning. (stream) | None |

Attributes:

| Name | Type | Description |
|------|------|-------------|
| checkpointLocation | str | Path to checkpoint files. (Streaming) |
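The destination's actual implementation is in the source below; as a conceptual illustration of how `url`, `headers`, `batch_size`, `method` and `parallelism` relate, rows are serialized to JSON, grouped into batches of `batch_size`, and sent with up to `parallelism` concurrent requests. The sketch below is not the library's code, just a plain-Python rendering of that idea using the `requests` package.

```python
# Conceptual sketch only - not the library implementation - of batched, parallel
# REST calls as described by the batch_size, method and parallelism parameters.
import json
from concurrent.futures import ThreadPoolExecutor

import requests


def post_rows(rows: list, url: str, headers: dict, batch_size: int = 100,
              method: str = "POST", parallelism: int = 8) -> list:
    # Group the serialized rows into payloads of at most batch_size records
    batches = [rows[i: i + batch_size] for i in range(0, len(rows), batch_size)]

    def send(batch: list) -> int:
        response = requests.request(method, url, headers=headers, data=json.dumps(batch))
        response.raise_for_status()
        return response.status_code

    # Issue up to `parallelism` concurrent calls
    with ThreadPoolExecutor(max_workers=parallelism) as executor:
        return list(executor.map(send, batches))
```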

Source code in src/sdk/python/rtdip_sdk/pipelines/destinations/spark/rest_api.py
class SparkRestAPIDestination(DestinationInterface):
    """
    The Spark Rest API Destination is used to write data to a Rest API.

    The payload sent to the API is constructed by converting each row in the DataFrame to Json.

    !!! Note
        While it is possible to use the `write_batch` method, it is easy to overwhelm a Rest API with large volumes of data.
        Consider reducing data volumes when writing to a Rest API in Batch mode to prevent API errors including throttling.

    Example
    --------
    ```python
    #Rest API Destination for Streaming Queries

    from rtdip_sdk.pipelines.destinations import SparkRestAPIDestination

    rest_api_destination = SparkRestAPIDestination(
        data=df,
        options={
            "checkpointLocation": "{/CHECKPOINT-LOCATION/}"
        },
        url="{REST-API-URL}",
        headers = {
            'Authorization': 'Bearer {}'.format("{TOKEN}")
        },
        batch_size=100,
        method="POST",
        parallelism=8,
        trigger="1 minute",
        query_name="DeltaRestAPIDestination",
        query_wait_interval=None
    )

    rest_api_destination.write_stream()
    ```
    ```python
    #Rest API Destination for Batch Queries

    from rtdip_sdk.pipelines.destinations import SparkRestAPIDestination

    rest_api_destination = SparkRestAPIDestination(
        data=df,
        options={},
        url="{REST-API-URL}",
        headers = {
            'Authorization': 'Bearer {}'.format("{TOKEN}")
        },
        batch_size=10,
        method="POST",
        parallelism=4,
        trigger="1 minute",
        query_name="DeltaRestAPIDestination",
        query_wait_interval=None
    )

    rest_api_destination.write_batch()
    ```

    Parameters:
        data (DataFrame): Dataframe to be merged into a Delta Table
        options (dict): A dictionary of options for streaming writes
        url (str): The Rest API Url
        headers (dict): A dictionary of headers to be provided to the Rest API
        batch_size (int): The number of DataFrame rows to be used in each Rest API call
        method (str): The method to be used when calling the Rest API. Allowed values are POST, PATCH and PUT
        parallelism (int): The number of concurrent calls to be made to the Rest API
        trigger (optional str): Frequency of the write operation. Specify "availableNow" to execute a trigger once, otherwise specify a time period such as "30 seconds", "5 minutes". Set to "0 seconds" if you do not want to use a trigger. (stream) Default is 10 seconds
        query_name (str): Unique name for the query in associated SparkSession
        query_wait_interval (optional int): If set, waits for the streaming query to complete before returning. (stream) Default is None

    Attributes:
        checkpointLocation (str): Path to checkpoint files. (Streaming)
    """

    data: DataFrame
    options: dict
    url: str
    headers: dict
    batch_size: int
    method: str
    parallelism: int
    trigger: str
    query_name: str
    query_wait_interval: int

    def __init__(
        self,
        data: DataFrame,
        options: dict,
        url: str,
        headers: dict,
        batch_size: int,
        method: str = "POST",
        parallelism: int = 8,
        trigger="1 minutes",
        query_name: str = "DeltaRestAPIDestination",
        query_wait_interval: int = None,
    ) -> None:
        self.data = data
        self.options = options
        self.url = url
        self.headers = headers
        self.batch_size = batch_size
        self.method = method
        self.parallelism = parallelism
        self.trigger = trigger
        self.query_name = query_name
        self.query_wait_interval = query_wait_interval

    @staticmethod
    def system_type():
        """
        Attributes:
            SystemType (Environment): Requires PYSPARK
        """
        return SystemType.PYSPARK

    @staticmethod
    def libraries():
        libraries = Libraries()
        libraries.add_pypi_library(get_default_package("api_requests"))
        return libraries

    @staticmethod
    def settings() -> dict:
        return {}

    def pre_write_validation(self):
        return True

    def post_write_validation(self):
        return True

    def _pre_batch_records_for_api_call(self, micro_batch_df: DataFrame):
        batch_count = math.ceil(micro_batch_df.count() / self.batch_size)
        micro_batch_df = (
            micro_batch_df.withColumn("content", to_json(struct(col("*"))))
            .withColumn("row_number", row_number().over(Window().orderBy(lit("A"))))
            .withColumn("batch_id", col("row_number") % batch_count)
        )
        return micro_batch_df.groupBy("batch_id").agg(
            concat_ws(",|", collect_list("content")).alias("payload")
        )

    def _api_micro_batch(self, micro_batch_df: DataFrame, epoch_id=None):  # NOSONAR
        url = self.url
        method = self.method
        headers = self.headers

        @udf("string")
        def _rest_api_execute(data):
            session = requests.Session()
            adapter = HTTPAdapter(max_retries=3)
            session.mount("http://", adapter)  # NOSONAR
            session.mount("https://", adapter)

            if method == "POST":
                response = session.post(url, headers=headers, data=data, verify=False)
            elif method == "PATCH":
                response = session.patch(url, headers=headers, data=data, verify=False)
            elif method == "PUT":
                response = session.put(url, headers=headers, data=data, verify=False)
            else:
                raise Exception("Method {} is not supported".format(method))  # NOSONAR

            if not (response.status_code == 200 or response.status_code == 201):
                raise Exception(
                    "Response status : {} .Response message : {}".format(
                        str(response.status_code), response.text
                    )
                )  # NOSONAR

            return str(response.status_code)

        micro_batch_df.persist()
        micro_batch_df = self._pre_batch_records_for_api_call(micro_batch_df)

        micro_batch_df = micro_batch_df.repartition(self.parallelism)

        (
            micro_batch_df.withColumn(
                "rest_api_response_code", _rest_api_execute(micro_batch_df["payload"])
            ).collect()
        )
        micro_batch_df.unpersist()

    def write_batch(self):
        """
        Writes batch data to a Rest API
        """
        try:
            return self._api_micro_batch(self.data)

        except Py4JJavaError as e:
            logging.exception(e.errmsg)
            raise e
        except Exception as e:
            logging.exception(str(e))
            raise e

    def write_stream(self):
        """
        Writes streaming data to a Rest API
        """
        try:
            TRIGGER_OPTION = (
                {"availableNow": True}
                if self.trigger == "availableNow"
                else {"processingTime": self.trigger}
            )
            query = (
                self.data.writeStream.trigger(**TRIGGER_OPTION)
                .foreachBatch(self._api_micro_batch)
                .queryName(self.query_name)
                .outputMode("update")
                .options(**self.options)
                .start()
            )

            if self.query_wait_interval:
                while query.isActive:
                    if query.lastProgress:
                        logging.info(query.lastProgress)
                    time.sleep(self.query_wait_interval)

        except Py4JJavaError as e:
            logging.exception(e.errmsg)
            raise e
        except Exception as e:
            logging.exception(str(e))
            raise e

system_type() staticmethod

Attributes:

  • SystemType (Environment): Requires PYSPARK

Source code in src/sdk/python/rtdip_sdk/pipelines/destinations/spark/rest_api.py
@staticmethod
def system_type():
    """
    Attributes:
        SystemType (Environment): Requires PYSPARK
    """
    return SystemType.PYSPARK

write_batch()

Writes batch data to a Rest API
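
As the class docstring notes, write_batch can easily overwhelm an API with large data volumes. A hedged sketch of one way to keep a batch run small, assuming df is a batch DataFrame and the 1000-row cap is purely illustrative:

```python
from rtdip_sdk.pipelines.destinations import SparkRestAPIDestination

SparkRestAPIDestination(
    data=df.limit(1000),  # cap the rows sent in a single batch run (illustrative value)
    options={},
    url="{REST-API-URL}",
    headers={"Authorization": "Bearer {TOKEN}"},
    batch_size=10,        # rows per API call
    parallelism=4,        # concurrent API calls
).write_batch()
```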

Source code in src/sdk/python/rtdip_sdk/pipelines/destinations/spark/rest_api.py
def write_batch(self):
    """
    Writes batch data to a Rest API
    """
    try:
        return self._api_micro_batch(self.data)

    except Py4JJavaError as e:
        logging.exception(e.errmsg)
        raise e
    except Exception as e:
        logging.exception(str(e))
        raise e

write_stream()

Writes streaming data to a Rest API

Source code in src/sdk/python/rtdip_sdk/pipelines/destinations/spark/rest_api.py
def write_stream(self):
    """
    Writes streaming data to a Rest API
    """
    try:
        TRIGGER_OPTION = (
            {"availableNow": True}
            if self.trigger == "availableNow"
            else {"processingTime": self.trigger}
        )
        query = (
            self.data.writeStream.trigger(**TRIGGER_OPTION)
            .foreachBatch(self._api_micro_batch)
            .queryName(self.query_name)
            .outputMode("update")
            .options(**self.options)
            .start()
        )

        if self.query_wait_interval:
            while query.isActive:
                if query.lastProgress:
                    logging.info(query.lastProgress)
                time.sleep(self.query_wait_interval)

    except Py4JJavaError as e:
        logging.exception(e.errmsg)
        raise e
    except Exception as e:
        logging.exception(str(e))
        raise e
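
To run the streaming writer as a one-off job, the trigger and query_wait_interval parameters documented above can be combined: "availableNow" processes everything currently available and then stops, while query_wait_interval makes write_stream poll the query until it is no longer active. A hedged usage sketch, assuming df is a streaming DataFrame and the checkpoint path is a placeholder:

```python
from rtdip_sdk.pipelines.destinations import SparkRestAPIDestination

rest_api_destination = SparkRestAPIDestination(
    data=df,
    options={"checkpointLocation": "{/CHECKPOINT-LOCATION/}"},
    url="{REST-API-URL}",
    headers={"Authorization": "Bearer {TOKEN}"},
    batch_size=100,
    trigger="availableNow",   # process all currently available data once, then stop
    query_wait_interval=10,   # check every 10 seconds until the query completes
)

rest_api_destination.write_stream()
```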

SparkPCDMToDeltaDestination

Bases: DestinationInterface

The Process Control Data Model written to Delta.

Example

#PCDM To Delta Destination for Streaming Queries

from rtdip_sdk.pipelines.destinations import SparkPCDMToDeltaDestination

pcdm_to_delta_destination = SparkPCDMToDeltaDestination(
    spark=spark,
    data=df,
    options={
        "checkpointLocation": "{/CHECKPOINT-LOCATION/}"
    },
    destination_float="{DELTA_TABLE_PATH_FLOAT}",
    destination_string="{DELTA_TABLE_PATH_STRING}",
    destination_integer="{DELTA_TABLE_PATH_INTEGER}",
    mode="append",
    trigger="10 seconds",
    query_name="PCDMToDeltaDestination",
    query_wait_interval=None,
    merge=True,
    try_broadcast_join=False,
    remove_nanoseconds=False,
    remove_duplicates=True
)

pcdm_to_delta_destination.write_stream()
#PCDM To Delta Destination for Batch Queries

from rtdip_sdk.pipelines.destinations import SparkPCDMToDeltaDestination

pcdm_to_delta_destination = SparkPCDMToDeltaDestination(
    spark=spark,
    data=df,
    options={
        "maxRecordsPerFile": "10000"
    },
    destination_float="{DELTA_TABLE_PATH_FLOAT}",
    destination_string="{DELTA_TABLE_PATH_STRING}",
    destination_integer="{DELTA_TABLE_PATH_INTEGER}",
    mode="overwrite",
    trigger="10 seconds",
    query_name="PCDMToDeltaDestination",
    query_wait_interval=None,
    merge=True,
    try_broadcast_join=False,
    remove_nanoseconds=False,
    remove_duplicates=True
)

pcdm_to_delta_destination.write_batch()

Parameters:

  • data (DataFrame): Dataframe to be merged into a Delta Table (see the schema sketch after the Attributes list). Required.
  • options (dict): Options that can be specified for a Delta Table write operation (See Attributes table below). Further information on the options is available for batch and streaming. Required.
  • destination_float (str): Either the name of the Hive Metastore or Unity Catalog Delta Table or the path to the Delta table to store float values. Required.
  • destination_string (Optional str): Either the name of the Hive Metastore or Unity Catalog Delta Table or the path to the Delta table to store string values. Default: None.
  • destination_integer (Optional str): Either the name of the Hive Metastore or Unity Catalog Delta Table or the path to the Delta table to store integer values. Default: None.
  • mode (str): Method of writing to Delta Table - append/overwrite (batch), append/complete (stream). Default: None.
  • trigger (optional str): Frequency of the write operation. Specify "availableNow" to execute a trigger once, otherwise specify a time period such as "30 seconds", "5 minutes". Set to "0 seconds" if you do not want to use a trigger. (stream) Default: '10 seconds'.
  • query_name (str): Unique name for the query in associated SparkSession. Default: 'PCDMToDeltaDestination'.
  • query_wait_interval (optional int): If set, waits for the streaming query to complete before returning. (stream) Default: None.
  • merge (bool): Use Delta Merge to perform inserts, updates and deletes. Default: True.
  • try_broadcast_join (bool): Attempts to perform a broadcast join in the merge which can leverage data skipping using partition pruning and file pruning automatically. Can fail if the dataframe being merged is large, and is therefore more suitable for streaming merges than batch merges. Default: False.
  • remove_nanoseconds (bool): Removes nanoseconds from the EventTime column and replaces with zeros. Default: False.
  • remove_duplicates (bool): Removes duplicates before writing the data. Default: True.

Attributes:

  • checkpointLocation (str): Path to checkpoint files. (Streaming)
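
A minimal sketch of the PCDM-shaped DataFrame this destination expects, based on the columns referenced in the source below (TagName, EventTime, Status, Value, ValueType and, when merge=True, ChangeType). The tag names, values and the 'float'/'string' ValueType labels are assumptions for illustration only:

```python
from pyspark.sql import SparkSession
from pyspark.sql.functions import col

spark = SparkSession.builder.getOrCreate()

pcdm_df = spark.createDataFrame(
    [
        ("Sensor01", "2024-01-01 00:00:00", "Good", "1.5", "float", "insert"),
        ("Sensor02", "2024-01-01 00:00:00", "Good", "on", "string", "insert"),
    ],
    ["TagName", "EventTime", "Status", "Value", "ValueType", "ChangeType"],
).withColumn("EventTime", col("EventTime").cast("timestamp"))

# pcdm_df can then be passed as the `data` parameter; rows are routed to the
# float/string/integer destinations according to their ValueType.
```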

Source code in src/sdk/python/rtdip_sdk/pipelines/destinations/spark/pcdm_to_delta.py
class SparkPCDMToDeltaDestination(DestinationInterface):
    """
    The Process Control Data Model written to Delta.

    Example
    --------
    ```python
    #PCDM Latest To Delta Destination for Streaming Queries

    from rtdip_sdk.pipelines.destinations import SparkPCDMToDeltaDestination

    pcdm_to_delta_destination = SparkPCDMToDeltaDestination(
        data=df,
        options={
            "checkpointLocation": "{/CHECKPOINT-LOCATION/}"
        },
        destination_float="{DELTA_TABLE_PATH_FLOAT}",
        destination_string="{DELTA_TABLE_PATH_STRING}",
        destination_integer="{DELTA_TABLE_PATH_INTEGER}",
        mode="append",
        trigger="10 seconds",
        query_name="PCDMToDeltaDestination",
        query_wait_interval=None,
        merge=True,
        try_broadcast_join=False,
        remove_nanoseconds=False,
        remove_duplicates=True
    )

    pcdm_to_delta_destination.write_stream()
    ```
    ```python
    #PCDM Latest To Delta Destination for Batch Queries

    from rtdip_sdk.pipelines.destinations import SparkPCDMToDeltaDestination

    pcdm_to_delta_destination = SparkPCDMToDeltaDestination(
        data=df,
        options={
            "maxRecordsPerFile": "10000"
        },
        destination_float="{DELTA_TABLE_PATH_FLOAT}",
        destination_string="{DELTA_TABLE_PATH_STRING}",
        destination_integer="{DELTA_TABLE_PATH_INTEGER}",
        mode="overwrite",
        trigger="10 seconds",
        query_name="PCDMToDeltaDestination",
        query_wait_interval=None,
        merge=True,
        try_broadcast_join=False,
        remove_nanoseconds=False,
        remove_duplicates=True
    )

    pcdm_to_delta_destination.write_batch()
    ```

    Parameters:
        data (DataFrame): Dataframe to be merged into a Delta Table
        options (dict): Options that can be specified for a Delta Table read operation (See Attributes table below). Further information on the options is available for [batch](https://docs.delta.io/latest/delta-batch.html#write-to-a-table){ target="_blank" } and [streaming](https://docs.delta.io/latest/delta-streaming.html#delta-table-as-a-sink){ target="_blank" }.
        destination_float (str): Either the name of the Hive Metastore or Unity Catalog Delta Table **or** the path to the Delta table to store float values.
        destination_string (Optional str): Either the name of the Hive Metastore or Unity Catalog Delta Table **or** the path to the Delta table to store string values.
        destination_integer (Optional str): Either the name of the Hive Metastore or Unity Catalog Delta Table **or** the path to the Delta table to store integer values
        mode (str): Method of writing to Delta Table - append/overwrite (batch), append/complete (stream)
        trigger (optional str): Frequency of the write operation. Specify "availableNow" to execute a trigger once, otherwise specify a time period such as "30 seconds", "5 minutes". Set to "0 seconds" if you do not want to use a trigger. (stream) Default is 10 seconds
        query_name (str): Unique name for the query in associated SparkSession
        query_wait_interval (optional int): If set, waits for the streaming query to complete before returning. (stream) Default is None
        merge (bool): Use Delta Merge to perform inserts, updates and deletes
        try_broadcast_join (bool): Attempts to perform a broadcast join in the merge which can leverage data skipping using partition pruning and file pruning automatically. Can fail if dataframe being merged is large and therefore more suitable for streaming merges than batch merges
        remove_nanoseconds (bool): Removes nanoseconds from the EventTime column and replaces with zeros
        remove_duplicates (bool): Removes duplicates before writing the data

    Attributes:
        checkpointLocation (str): Path to checkpoint files. (Streaming)
    """

    spark: SparkSession
    data: DataFrame
    options: dict
    destination_float: str
    destination_string: str
    destination_integer: str
    mode: str
    trigger: str
    query_name: str
    query_wait_interval: int
    merge: bool
    try_broadcast_join: bool
    remove_nanoseconds: bool
    remove_duplicates: bool

    def __init__(
        self,
        spark: SparkSession,
        data: DataFrame,
        options: dict,
        destination_float: str,
        destination_string: str = None,
        destination_integer: str = None,
        mode: str = None,
        trigger="10 seconds",
        query_name: str = "PCDMToDeltaDestination",
        query_wait_interval: int = None,
        merge: bool = True,
        try_broadcast_join=False,
        remove_nanoseconds: bool = False,
        remove_duplicates: bool = True,
    ) -> None:
        self.spark = spark
        self.data = data
        self.destination_float = destination_float
        self.destination_string = destination_string
        self.destination_integer = destination_integer
        self.options = options
        self.mode = mode
        self.trigger = trigger
        self.query_name = query_name
        self.query_wait_interval = query_wait_interval
        self.merge = merge
        self.try_broadcast_join = try_broadcast_join
        self.remove_nanoseconds = remove_nanoseconds
        self.remove_duplicates = remove_duplicates

    @staticmethod
    def system_type():
        """
        Attributes:
            SystemType (Environment): Requires PYSPARK
        """
        return SystemType.PYSPARK

    @staticmethod
    def libraries():
        libraries = Libraries()
        libraries.add_maven_library(get_default_package("spark_delta_core"))
        return libraries

    @staticmethod
    def settings() -> dict:
        return {}

    def pre_write_validation(self):
        return True

    def post_write_validation(self):
        return True

    def _get_eventdate_string(self, df: DataFrame) -> str:
        dates_df = df.select("EventDate").distinct()
        dates_df = dates_df.select(
            date_format("EventDate", "yyyy-MM-dd").alias("EventDate")
        )
        dates_list = list(dates_df.toPandas()["EventDate"])
        return str(dates_list).replace("[", "").replace("]", "")

    def _write_delta_merge(self, df: DataFrame, destination: str):
        df = df.select(
            "EventDate", "TagName", "EventTime", "Status", "Value", "ChangeType"
        )
        when_matched_update_list = [
            DeltaMergeConditionValues(
                condition="(source.ChangeType IN ('insert', 'update', 'upsert')) AND ((source.Status != target.Status) OR (source.Value != target.Value))",
                values={
                    "EventDate": "source.EventDate",
                    "TagName": "source.TagName",
                    "EventTime": "source.EventTime",
                    "Status": "source.Status",
                    "Value": "source.Value",
                },
            )
        ]
        when_matched_delete_list = [
            DeltaMergeCondition(condition="source.ChangeType = 'delete'")
        ]
        when_not_matched_insert_list = [
            DeltaMergeConditionValues(
                condition="(source.ChangeType IN ('insert', 'update', 'upsert'))",
                values={
                    "EventDate": "source.EventDate",
                    "TagName": "source.TagName",
                    "EventTime": "source.EventTime",
                    "Status": "source.Status",
                    "Value": "source.Value",
                },
            )
        ]

        merge_condition = "source.EventDate = target.EventDate AND source.TagName = target.TagName AND source.EventTime = target.EventTime"

        perform_merge = True
        if self.try_broadcast_join != True:
            eventdate_string = self._get_eventdate_string(df)
            if eventdate_string == None or eventdate_string == "":
                perform_merge = False
            else:
                merge_condition = (
                    "target.EventDate in ({}) AND ".format(eventdate_string)
                    + merge_condition
                )

        if perform_merge == True:
            SparkDeltaMergeDestination(
                spark=self.spark,
                data=df,
                destination=destination,
                options=self.options,
                merge_condition=merge_condition,
                when_matched_update_list=when_matched_update_list,
                when_matched_delete_list=when_matched_delete_list,
                when_not_matched_insert_list=when_not_matched_insert_list,
                try_broadcast_join=self.try_broadcast_join,
                trigger=self.trigger,
                query_name=self.query_name,
            ).write_batch()

    def _write_delta_batch(self, df: DataFrame, destination: str):
        if self.merge == True:
            if "EventDate" not in df.columns:
                df = df.withColumn("EventDate", date_format("EventTime", "yyyy-MM-dd"))

            self._write_delta_merge(
                df.filter(col("ChangeType").isin("insert", "update", "upsert")),
                destination,
            )
            self._write_delta_merge(
                df.filter(col("ChangeType") == "delete"), destination
            )
        else:
            df = df.select("TagName", "EventTime", "Status", "Value")
            SparkDeltaDestination(
                data=df,
                destination=destination,
                options=self.options,
                mode=self.mode,
                trigger=self.trigger,
                query_name=self.query_name,
            ).write_batch()

    def _write_data_by_type(self, df: DataFrame):
        if self.merge == True:
            df = df.withColumn(
                "ChangeType",
                when(df["ChangeType"].isin("insert", "update"), "upsert").otherwise(
                    df["ChangeType"]
                ),
            )

        if self.remove_nanoseconds == True:
            df = df.withColumn(
                "EventTime",
                (floor(col("EventTime").cast("double") * 1000) / 1000).cast(
                    "timestamp"
                ),
            )

        if self.remove_duplicates == True:
            df = df.drop_duplicates(["TagName", "EventTime", "ChangeType"])

        float_df = df.filter(ValueTypeConstants.FLOAT_VALUE).withColumn(
            "Value", col("Value").cast("float")
        )
        self._write_delta_batch(float_df, self.destination_float)

        if self.destination_string != None:
            string_df = df.filter(ValueTypeConstants.STRING_VALUE)
            self._write_delta_batch(string_df, self.destination_string)

        if self.destination_integer != None:
            integer_df = df.filter(ValueTypeConstants.INTEGER_VALUE).withColumn(
                "Value", col("Value").cast("integer")
            )
            self._write_delta_batch(integer_df, self.destination_integer)

    def _write_stream_microbatches(self, df: DataFrame, epoch_id=None):  # NOSONAR
        df.persist()
        self._write_data_by_type(df)
        df.unpersist()

    def write_batch(self):
        """
        Writes Process Control Data Model data to Delta
        """
        try:
            if self.try_broadcast_join != True:
                self.data.persist()

            self._write_data_by_type(self.data)

            if self.try_broadcast_join != True:
                self.data.unpersist()

        except Py4JJavaError as e:
            logging.exception(e.errmsg)
            raise e
        except Exception as e:
            logging.exception(str(e))
            raise e

    def write_stream(self):
        """
        Writes streaming Process Control Data Model data to Delta using foreachBatch
        """
        try:
            TRIGGER_OPTION = (
                {"availableNow": True}
                if self.trigger == "availableNow"
                else {"processingTime": self.trigger}
            )
            if self.merge == True:
                query = (
                    self.data.writeStream.trigger(**TRIGGER_OPTION)
                    .format("delta")
                    .foreachBatch(self._write_stream_microbatches)
                    .queryName(self.query_name)
                    .outputMode("update")
                    .options(**self.options)
                    .start()
                )
            else:
                default_checkpoint_location = None
                float_checkpoint_location = None
                string_checkpoint_location = None
                integer_checkpoint_location = None

                append_options = self.options.copy()
                if "checkpointLocation" in self.options:
                    default_checkpoint_location = self.options["checkpointLocation"]
                    if default_checkpoint_location[-1] != "/":
                        default_checkpoint_location += "/"
                    float_checkpoint_location = default_checkpoint_location + "float"
                    string_checkpoint_location = default_checkpoint_location + "string"
                    integer_checkpoint_location = (
                        default_checkpoint_location + "integer"
                    )

                if float_checkpoint_location is not None:
                    append_options["checkpointLocation"] = float_checkpoint_location

                delta_float = SparkDeltaDestination(
                    data=self.data.select("TagName", "EventTime", "Status", "Value")
                    .filter(ValueTypeConstants.FLOAT_VALUE)
                    .withColumn("Value", col("Value").cast("float")),
                    destination=self.destination_float,
                    options=append_options,
                    mode=self.mode,
                    trigger=self.trigger,
                    query_name=self.query_name + "_float",
                )

                delta_float.write_stream()

                if self.destination_string != None:
                    if string_checkpoint_location is not None:
                        append_options["checkpointLocation"] = (
                            string_checkpoint_location
                        )

                    delta_string = SparkDeltaDestination(
                        data=self.data.select(
                            "TagName", "EventTime", "Status", "Value"
                        ).filter(ValueTypeConstants.STRING_VALUE),
                        destination=self.destination_string,
                        options=append_options,
                        mode=self.mode,
                        trigger=self.trigger,
                        query_name=self.query_name + "_string",
                    )

                    delta_string.write_stream()

                if self.destination_integer != None:
                    if integer_checkpoint_location is not None:
                        append_options["checkpointLocation"] = (
                            integer_checkpoint_location
                        )

                    delta_integer = SparkDeltaDestination(
                        data=self.data.select("TagName", "EventTime", "Status", "Value")
                        .filter(ValueTypeConstants.INTEGER_VALUE)
                        .withColumn("Value", col("Value").cast("integer")),
                        destination=self.destination_integer,
                        options=append_options,
                        mode=self.mode,
                        trigger=self.trigger,
                        query_name=self.query_name + "_integer",
                    )

                    delta_integer.write_stream()

                if self.query_wait_interval:
                    while self.spark.streams.active != []:
                        for query in self.spark.streams.active:
                            if query.lastProgress:
                                logging.info(
                                    "{}: {}".format(query.name, query.lastProgress)
                                )
                        time.sleep(self.query_wait_interval)

        except Py4JJavaError as e:
            logging.exception(e.errmsg)
            raise e
        except Exception as e:
            logging.exception(str(e))
            raise e

system_type() staticmethod

Attributes:

  • SystemType (Environment): Requires PYSPARK

Source code in src/sdk/python/rtdip_sdk/pipelines/destinations/spark/pcdm_to_delta.py
@staticmethod
def system_type():
    """
    Attributes:
        SystemType (Environment): Requires PYSPARK
    """
    return SystemType.PYSPARK

write_batch()

Writes Process Control Data Model data to Delta

Source code in src/sdk/python/rtdip_sdk/pipelines/destinations/spark/pcdm_to_delta.py
def write_batch(self):
    """
    Writes Process Control Data Model data to Delta
    """
    try:
        if self.try_broadcast_join != True:
            self.data.persist()

        self._write_data_by_type(self.data)

        if self.try_broadcast_join != True:
            self.data.unpersist()

    except Py4JJavaError as e:
        logging.exception(e.errmsg)
        raise e
    except Exception as e:
        logging.exception(str(e))
        raise e

write_stream()

Writes streaming Process Control Data Model data to Delta using foreachBatch

Source code in src/sdk/python/rtdip_sdk/pipelines/destinations/spark/pcdm_to_delta.py
def write_stream(self):
    """
    Writes streaming Process Control Data Model data to Delta using foreachBatch
    """
    try:
        TRIGGER_OPTION = (
            {"availableNow": True}
            if self.trigger == "availableNow"
            else {"processingTime": self.trigger}
        )
        if self.merge == True:
            query = (
                self.data.writeStream.trigger(**TRIGGER_OPTION)
                .format("delta")
                .foreachBatch(self._write_stream_microbatches)
                .queryName(self.query_name)
                .outputMode("update")
                .options(**self.options)
                .start()
            )
        else:
            default_checkpoint_location = None
            float_checkpoint_location = None
            string_checkpoint_location = None
            integer_checkpoint_location = None

            append_options = self.options.copy()
            if "checkpointLocation" in self.options:
                default_checkpoint_location = self.options["checkpointLocation"]
                if default_checkpoint_location[-1] != "/":
                    default_checkpoint_location += "/"
                float_checkpoint_location = default_checkpoint_location + "float"
                string_checkpoint_location = default_checkpoint_location + "string"
                integer_checkpoint_location = (
                    default_checkpoint_location + "integer"
                )

            if float_checkpoint_location is not None:
                append_options["checkpointLocation"] = float_checkpoint_location

            delta_float = SparkDeltaDestination(
                data=self.data.select("TagName", "EventTime", "Status", "Value")
                .filter(ValueTypeConstants.FLOAT_VALUE)
                .withColumn("Value", col("Value").cast("float")),
                destination=self.destination_float,
                options=append_options,
                mode=self.mode,
                trigger=self.trigger,
                query_name=self.query_name + "_float",
            )

            delta_float.write_stream()

            if self.destination_string != None:
                if string_checkpoint_location is not None:
                    append_options["checkpointLocation"] = (
                        string_checkpoint_location
                    )

                delta_string = SparkDeltaDestination(
                    data=self.data.select(
                        "TagName", "EventTime", "Status", "Value"
                    ).filter(ValueTypeConstants.STRING_VALUE),
                    destination=self.destination_string,
                    options=append_options,
                    mode=self.mode,
                    trigger=self.trigger,
                    query_name=self.query_name + "_string",
                )

                delta_string.write_stream()

            if self.destination_integer != None:
                if integer_checkpoint_location is not None:
                    append_options["checkpointLocation"] = (
                        integer_checkpoint_location
                    )

                delta_integer = SparkDeltaDestination(
                    data=self.data.select("TagName", "EventTime", "Status", "Value")
                    .filter(ValueTypeConstants.INTEGER_VALUE)
                    .withColumn("Value", col("Value").cast("integer")),
                    destination=self.destination_integer,
                    options=append_options,
                    mode=self.mode,
                    trigger=self.trigger,
                    query_name=self.query_name + "_integer",
                )

                delta_integer.write_stream()

            if self.query_wait_interval:
                while self.spark.streams.active != []:
                    for query in self.spark.streams.active:
                        if query.lastProgress:
                            logging.info(
                                "{}: {}".format(query.name, query.lastProgress)
                            )
                    time.sleep(self.query_wait_interval)

    except Py4JJavaError as e:
        logging.exception(e.errmsg)
        raise e
    except Exception as e:
        logging.exception(str(e))
        raise e

SparkPCDMLatestToDeltaDestination

Bases: DestinationInterface

The Process Control Data Model Latest Values written to Delta.

Example

#PCDM Latest To Delta Destination for Streaming Queries

from rtdip_sdk.pipelines.destinations import SparkPCDMLatestToDeltaDestination

pcdm_latest_to_delta_destination = SparkPCDMLatestToDeltaDestination(
    spark=spark,
    data=df,
    options={
        "checkpointLocation": "{/CHECKPOINT-LOCATION/}"
    },
    destination="{DELTA_TABLE_PATH}",
    mode="append",
    trigger="10 seconds",
    query_name="PCDMLatestToDeltaDestination",
    query_wait_interval=None
)

pcdm_latest_to_delta_destination.write_stream()
#PCDM Latest To Delta Destination for Batch Queries

from rtdip_sdk.pipelines.destinations import SparkPCDMLatestToDeltaDestination

pcdm_latest_to_delta_destination = SparkPCDMLatestToDeltaDestination(
    spark=spark,
    data=df,
    options={
        "maxRecordsPerFile": "10000"
    },
    destination="{DELTA_TABLE_PATH}",
    mode="overwrite",
    trigger="10 seconds",
    query_name="PCDMLatestToDeltaDestination",
    query_wait_interval=None
)

pcdm_latest_to_delta_destination.write_batch()

Parameters:

  • data (DataFrame): Dataframe to be merged into a Delta Table. Required.
  • options (dict): Options that can be specified for a Delta Table write operation (See Attributes table below). Further information on the options is available for batch and streaming. Required.
  • destination (str): Either the name of the Hive Metastore or Unity Catalog Delta Table or the path to the Delta table to store the latest values. Required.
  • mode (str): Method of writing to Delta Table - append/overwrite (batch), append/complete (stream). Default: None.
  • trigger (optional str): Frequency of the write operation. Specify "availableNow" to execute a trigger once, otherwise specify a time period such as "30 seconds", "5 minutes". Set to "0 seconds" if you do not want to use a trigger. (stream) Default: '10 seconds'.
  • query_name (str): Unique name for the query in associated SparkSession. Default: 'PCDMLatestToDeltaDestination'.
  • query_wait_interval (optional int): If set, waits for the streaming query to complete before returning. (stream) Default: None.

Attributes:

  • checkpointLocation (str): Path to checkpoint files. (Streaming)
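
The columns maintained in the destination table can be inferred from the merge logic in the source below: the most recent event per TagName, plus a parallel set of "Good" columns holding the most recent event whose Status is 'Good'. A hedged read-back sketch, assuming an active SparkSession named spark and a placeholder table path:

```python
latest_df = spark.read.format("delta").load("{DELTA_TABLE_PATH}")

latest_df.select(
    "TagName",
    "EventTime", "Status", "Value", "ValueType",    # latest event per tag
    "GoodEventTime", "GoodValue", "GoodValueType",  # latest event with Status == 'Good'
).show()
```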

Source code in src/sdk/python/rtdip_sdk/pipelines/destinations/spark/pcdm_latest_to_delta.py
class SparkPCDMLatestToDeltaDestination(DestinationInterface):
    """
    The Process Control Data Model Latest Values written to Delta.

    Example
    --------
    ```python
    #PCDM Latest To Delta Destination for Streaming Queries

    from rtdip_sdk.pipelines.destinations import SparkPCDMLatestToDeltaDestination

    pcdm_latest_to_delta_destination = SparkPCDMLatestToDeltaDestination(
        data=df,
        options={
            "checkpointLocation": "{/CHECKPOINT-LOCATION/}"
        },
        destination="{DELTA_TABLE_PATH}",
        mode="append",
        trigger="10 seconds",
        query_name="PCDMLatestToDeltaDestination",
        query_wait_interval=None
    )

    pcdm_latest_to_delta_destination.write_stream()
    ```
    ```python
    #PCDM Latest To Delta Destination for Batch Queries

    from rtdip_sdk.pipelines.destinations import SparkPCDMLatestToDeltaDestination

    pcdm_latest_to_delta_destination = SparkPCDMLatestToDeltaDestination(
        data=df,
        options={
            "maxRecordsPerFile", "10000"
        },
        destination="{DELTA_TABLE_PATH}",
        mode="overwrite",
        trigger="10 seconds",
        query_name="PCDMLatestToDeltaDestination",
        query_wait_interval=None
    )

    pcdm_latest_to_delta_destination.write_batch()
    ```

    Parameters:
        data (DataFrame): Dataframe to be merged into a Delta Table
        options (dict): Options that can be specified for a Delta Table read operation (See Attributes table below). Further information on the options is available for [batch](https://docs.delta.io/latest/delta-batch.html#write-to-a-table){ target="_blank" } and [streaming](https://docs.delta.io/latest/delta-streaming.html#delta-table-as-a-sink){ target="_blank" }.
        destination (str): Either the name of the Hive Metastore or Unity Catalog Delta Table **or** the path to the Delta table to store the latest values
        mode (str): Method of writing to Delta Table - append/overwrite (batch), append/complete (stream)
        trigger (optional str): Frequency of the write operation. Specify "availableNow" to execute a trigger once, otherwise specify a time period such as "30 seconds", "5 minutes". Set to "0 seconds" if you do not want to use a trigger. (stream) Default is 10 seconds
        query_name (str): Unique name for the query in associated SparkSession
        query_wait_interval (optional int): If set, waits for the streaming query to complete before returning. (stream) Default is None

    Attributes:
        checkpointLocation (str): Path to checkpoint files. (Streaming)
    """

    spark: SparkSession
    data: DataFrame
    options: dict
    destination: str
    mode: str
    trigger: str
    query_name: str
    query_wait_interval: int

    def __init__(
        self,
        spark: SparkSession,
        data: DataFrame,
        options: dict,
        destination: str,
        mode: str = None,
        trigger="10 seconds",
        query_name: str = "PCDMLatestToDeltaDestination",
        query_wait_interval: int = None,
    ) -> None:
        self.spark = spark
        self.data = data
        self.destination = destination
        self.options = options
        self.mode = mode
        self.trigger = trigger
        self.query_name = query_name
        self.query_wait_interval = query_wait_interval

    @staticmethod
    def system_type():
        """
        Attributes:
            SystemType (Environment): Requires PYSPARK
        """
        return SystemType.PYSPARK

    @staticmethod
    def libraries():
        libraries = Libraries()
        libraries.add_maven_library(get_default_package("spark_delta_core"))
        return libraries

    @staticmethod
    def settings() -> dict:
        return {}

    def pre_write_validation(self):
        return True

    def post_write_validation(self):
        return True

    def _write_latest_to_delta(self, df: DataFrame, epoch_id=None):  # NOSONAR
        df.persist()

        latest_df = (
            df.withColumn(
                "Latest",
                max(struct("EventTime", "Status")).over(Window.partitionBy("TagName")),
            )
            .withColumn(
                "GoodLatest",
                when(
                    col("Latest.Status") == "Good",
                    struct(col("EventTime"), col("Value"), col("ValueType")),
                ).otherwise(
                    max(
                        when(
                            col("Status") == "Good",
                            struct("EventTime", "Value", "ValueType"),
                        )
                    ).over(Window.partitionBy("TagName"))
                ),
            )
            .filter(col("EventTime") == col("Latest.EventTime"))
            .drop("Latest")
            .dropDuplicates(["TagName"])
        )

        when_matched_update_list = [
            DeltaMergeConditionValues(
                condition="source.EventTime > target.EventTime AND (source.GoodLatest.EventTime IS NULL OR source.GoodLatest.EventTime <= target.GoodEventTime)",
                values={
                    "EventTime": "source.EventTime",
                    "Status": "source.Status",
                    "Value": "source.Value",
                    "ValueType": "source.ValueType",
                },
            ),
            DeltaMergeConditionValues(
                condition="source.EventTime > target.EventTime AND (source.GoodLatest.EventTime IS NOT NULL AND (source.GoodLatest.EventTime > target.GoodEventTime OR target.GoodEventTime IS NULL))",
                values={
                    "EventTime": "source.EventTime",
                    "Status": "source.Status",
                    "Value": "source.Value",
                    "ValueType": "source.ValueType",
                    "GoodEventTime": "source.GoodLatest.EventTime",
                    "GoodValue": "source.GoodLatest.Value",
                    "GoodValueType": "source.GoodLatest.ValueType",
                },
            ),
            DeltaMergeConditionValues(
                condition="source.EventTime <= target.EventTime AND (source.GoodLatest.EventTime IS NOT NULL AND (source.GoodLatest.EventTime > target.GoodEventTime OR target.GoodEventTime IS NULL))",
                values={
                    "GoodEventTime": "source.GoodLatest.EventTime",
                    "GoodValue": "source.GoodLatest.Value",
                    "GoodValueType": "source.GoodLatest.ValueType",
                },
            ),
        ]

        when_not_matched_insert_list = [
            DeltaMergeConditionValues(
                values={
                    "TagName": "source.TagName",
                    "EventTime": "source.EventTime",
                    "Status": "source.Status",
                    "Value": "source.Value",
                    "ValueType": "source.ValueType",
                    "GoodEventTime": "source.GoodLatest.EventTime",
                    "GoodValue": "source.GoodLatest.Value",
                    "GoodValueType": "source.GoodLatest.ValueType",
                },
            )
        ]

        merge_condition = "source.TagName = target.TagName"

        SparkDeltaMergeDestination(
            spark=self.spark,
            data=latest_df,
            destination=self.destination,
            options=self.options,
            merge_condition=merge_condition,
            when_matched_update_list=when_matched_update_list,
            when_not_matched_insert_list=when_not_matched_insert_list,
            trigger=self.trigger,
            query_name=self.query_name,
        ).write_batch()

        df.unpersist()

    def write_batch(self):
        """
        Writes Process Control Data Model data to Delta
        """
        try:
            self._write_latest_to_delta(self.data)

        except Py4JJavaError as e:
            logging.exception(e.errmsg)
            raise e
        except Exception as e:
            logging.exception(str(e))
            raise e

    def write_stream(self):
        """
        Writes streaming Process Control Data Model data to Delta using foreachBatch
        """
        try:
            TRIGGER_OPTION = (
                {"availableNow": True}
                if self.trigger == "availableNow"
                else {"processingTime": self.trigger}
            )

            query = (
                self.data.writeStream.trigger(**TRIGGER_OPTION)
                .format("delta")
                .foreachBatch(self._write_latest_to_delta)
                .queryName(self.query_name)
                .outputMode("append")
                .options(**self.options)
                .start()
            )

            if self.query_wait_interval:
                while query.isActive:
                    if query.lastProgress:
                        logging.info(query.lastProgress)
                    time.sleep(self.query_wait_interval)

        except Py4JJavaError as e:
            logging.exception(e.errmsg)
            raise e
        except Exception as e:
            logging.exception(str(e))
            raise e

system_type() staticmethod

Attributes:

  • SystemType (Environment): Requires PYSPARK

Source code in src/sdk/python/rtdip_sdk/pipelines/destinations/spark/pcdm_latest_to_delta.py
@staticmethod
def system_type():
    """
    Attributes:
        SystemType (Environment): Requires PYSPARK
    """
    return SystemType.PYSPARK

write_batch()

Writes Process Control Data Model data to Delta

Source code in src/sdk/python/rtdip_sdk/pipelines/destinations/spark/pcdm_latest_to_delta.py
def write_batch(self):
    """
    Writes Process Control Data Model data to Delta
    """
    try:
        self._write_latest_to_delta(self.data)

    except Py4JJavaError as e:
        logging.exception(e.errmsg)
        raise e
    except Exception as e:
        logging.exception(str(e))
        raise e

write_stream()

Writes streaming Process Control Data Model data to Delta using foreachBatch

Source code in src/sdk/python/rtdip_sdk/pipelines/destinations/spark/pcdm_latest_to_delta.py
def write_stream(self):
    """
    Writes streaming Process Control Data Model data to Delta using foreachBatch
    """
    try:
        TRIGGER_OPTION = (
            {"availableNow": True}
            if self.trigger == "availableNow"
            else {"processingTime": self.trigger}
        )

        query = (
            self.data.writeStream.trigger(**TRIGGER_OPTION)
            .format("delta")
            .foreachBatch(self._write_latest_to_delta)
            .queryName(self.query_name)
            .outputMode("append")
            .options(**self.options)
            .start()
        )

        if self.query_wait_interval:
            while query.isActive:
                if query.lastProgress:
                    logging.info(query.lastProgress)
                time.sleep(self.query_wait_interval)

    except Py4JJavaError as e:
        logging.exception(e.errmsg)
        raise e
    except Exception as e:
        logging.exception(str(e))
        raise e

SparkKafkaEventhubDestination

Bases: DestinationInterface

This Spark Destination class is used to write batch or streaming data to an Eventhub using the Kafka protocol. This enables Eventhubs to be used as a destination in applications like Delta Live Tables or Databricks Serverless Jobs as the Spark Eventhubs JAR is not supported in these scenarios.

Default settings will be specified if not provided in the options parameter:

  • kafka.sasl.mechanism will be set to PLAIN
  • kafka.security.protocol will be set to SASL_SSL
  • kafka.request.timeout.ms will be set to 60000
  • kafka.session.timeout.ms will be set to 60000
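
Any of these defaults can be overridden by passing the same key in the options dict. A hedged sketch; the broker address (Event Hubs namespaces typically expose their Kafka endpoint on port 9093) and the timeout value are assumptions, not values taken from this page:

```python
options = {
    "kafka.bootstrap.servers": "{NAMESPACE}.servicebus.windows.net:9093",  # assumed Kafka endpoint
    "kafka.request.timeout.ms": "120000",  # overrides the 60000 default listed above
}
```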

Example

from rtdip_sdk.pipelines.destinations import SparkKafkaEventhubDestination
from rtdip_sdk.pipelines.utilities import SparkSessionUtility

# Not required if using Databricks
spark = SparkSessionUtility(config={}).execute()

connection_string = "Endpoint=sb://{NAMESPACE}.servicebus.windows.net/;SharedAccessKeyName={ACCESS_KEY_NAME};SharedAccessKey={ACCESS_KEY}=;EntityPath={EVENT_HUB_NAME}"

eventhub_destination = SparkKafkaEventhubDestination(
    spark=spark,
    data=df,
    options={
        "kafka.bootstrap.servers": "host1:port1,host2:port2"
    },
    connection_string=connection_string,
    consumer_group="{YOUR-EVENTHUB-CONSUMER-GROUP}",
    trigger="10 seconds",
    query_name="KafkaEventhubDestination",
    query_wait_interval=None
)

eventhub_destination.write_stream()

OR

eventhub_destination.write_batch()

Parameters:

  • spark (SparkSession): Spark Session. Required.
  • data (DataFrame): Any columns not listed in the required schema here will be merged into a single column named "value", or ignored if "value" is an existing column (see the sketch after the Attributes list). Required.
  • connection_string (str): Eventhubs connection string is required to connect to the Eventhubs service. This must include the Eventhub name as the EntityPath parameter. Example "Endpoint=sb://test.servicebus.windows.net/;SharedAccessKeyName=test;SharedAccessKey=test_key;EntityPath=test_eventhub". Required.
  • options (dict): A dictionary of Kafka configurations (See Attributes tables below). Required.
  • consumer_group (str): The Eventhub consumer group to use for the connection. Required.
  • trigger (optional str): Frequency of the write operation. Specify "availableNow" to execute a trigger once, otherwise specify a time period such as "30 seconds", "5 minutes". Set to "0 seconds" if you do not want to use a trigger. (stream) Default: '10 seconds'.
  • query_name (optional str): Unique name for the query in associated SparkSession. Default: 'KafkaEventhubDestination'.
  • query_wait_interval (optional int): If set, waits for the streaming query to complete before returning. (stream) Default: None.

The following are commonly used parameters that may be included in the options dict. kafka.bootstrap.servers is the only required config. A full list of configs can be found here

Attributes:

  • kafka.bootstrap.servers (A comma-separated list of host:port): The Kafka "bootstrap.servers" configuration. (Streaming and Batch)
  • topic (string): Required if there is no existing topic column in your DataFrame. Sets the topic that all rows will be written to in Kafka. (Streaming and Batch)
  • includeHeaders (bool): Determines whether to include the Kafka headers in the row; defaults to False. (Streaming and Batch)
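
A hedged sketch of shaping a DataFrame before handing it to this destination. Columns outside the Kafka schema are merged into "value" automatically, but the "value" column can also be built explicitly, for example as JSON; df and the column names used here are assumptions:

```python
from pyspark.sql.functions import struct, to_json

kafka_ready_df = df.select(
    to_json(struct("TagName", "EventTime", "Value")).alias("value")  # serialise each row as a JSON payload
)
```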

Source code in src/sdk/python/rtdip_sdk/pipelines/destinations/spark/kafka_eventhub.py
class SparkKafkaEventhubDestination(DestinationInterface):
    """
    This Spark Destination class is used to write batch or streaming data to an Eventhub using the Kafka protocol. This enables Eventhubs to be used as a destination in applications like Delta Live Tables or Databricks Serverless Jobs as the Spark Eventhubs JAR is not supported in these scenarios.

    Default settings will be specified if not provided in the `options` parameter:

    - `kafka.sasl.mechanism` will be set to `PLAIN`
    - `kafka.security.protocol` will be set to `SASL_SSL`
    - `kafka.request.timeout.ms` will be set to `60000`
    - `kafka.session.timeout.ms` will be set to `60000`

    Example
    --------
    ```python
    from rtdip_sdk.pipelines.destinations import SparkKafkaEventhubDestination
    from rtdip_sdk.pipelines.utilities import SparkSessionUtility

    # Not required if using Databricks
    spark = SparkSessionUtility(config={}).execute()

    connectionString = "Endpoint=sb://{NAMESPACE}.servicebus.windows.net/;SharedAccessKeyName={ACCESS_KEY_NAME};SharedAccessKey={ACCESS_KEY};EntityPath={EVENT_HUB_NAME}"

    eventhub_destination = SparkKafkaEventhubDestination(
        spark=spark,
        data=df,
        options={
            "kafka.bootstrap.servers": "host1:port1,host2:port2"
        },
        connection_string=connectionString,
        consumer_group="{YOUR-EVENTHUB-CONSUMER-GROUP}",
        trigger="10 seconds",
        query_name="KafkaEventhubDestination",
        query_wait_interval=None
    )

    eventhub_destination.write_stream()

    OR

    eventhub_destination.write_batch()
    ```

    Parameters:
        spark (SparkSession): Spark Session
        data (DataFrame): Any columns not listed in the required schema [here](https://spark.apache.org/docs/latest/structured-streaming-kafka-integration.html#writing-data-to-kafka){ target="_blank" } will be merged into a single column named "value", or ignored if "value" is an existing column
        connection_string (str): Eventhubs connection string is required to connect to the Eventhubs service. This must include the Eventhub name as the `EntityPath` parameter. Example `"Endpoint=sb://test.servicebus.windows.net/;SharedAccessKeyName=test;SharedAccessKey=test_key;EntityPath=test_eventhub"`
        options (dict): A dictionary of Kafka configurations (See Attributes tables below)
        consumer_group (str): The Eventhub consumer group to use for the connection
        trigger (optional str): Frequency of the write operation. Specify "availableNow" to execute a trigger once, otherwise specify a time period such as "30 seconds", "5 minutes". Set to "0 seconds" if you do not want to use a trigger. (stream) Default is 10 seconds
        query_name (optional str): Unique name for the query in associated SparkSession
        query_wait_interval (optional int): If set, waits for the streaming query to complete before returning. (stream) Default is None

    The following are commonly used parameters that may be included in the options dict. kafka.bootstrap.servers is the only required config. A full list of configs can be found [here](https://kafka.apache.org/documentation/#producerconfigs){ target="_blank" }

    Attributes:
        kafka.bootstrap.servers (A comma-separated list of host:port): The Kafka "bootstrap.servers" configuration. (Streaming and Batch)
        topic (string): Required if there is no existing topic column in your DataFrame. Sets the topic that all rows will be written to in Kafka. (Streaming and Batch)
        includeHeaders (bool): Determines whether to include the Kafka headers in the row; defaults to False. (Streaming and Batch)
    """

    spark: SparkSession
    data: DataFrame
    connection_string: str
    options: dict
    consumer_group: str
    trigger: str
    query_name: str
    connection_string_properties: dict
    query_wait_interval: int

    def __init__(
        self,
        spark: SparkSession,
        data: DataFrame,
        connection_string: str,
        options: dict,
        consumer_group: str,
        trigger: str = "10 seconds",
        query_name: str = "KafkaEventhubDestination",
        query_wait_interval: int = None,
    ) -> None:
        self.spark = spark
        self.data = data
        self.connection_string = connection_string
        self.options = options
        self.consumer_group = consumer_group
        self.trigger = trigger
        self.query_name = query_name
        self.connection_string_properties = self._parse_connection_string(
            connection_string
        )
        self.options = self._configure_options(options)
        self.query_wait_interval = query_wait_interval

    @staticmethod
    def system_type():
        """
        Attributes:
            SystemType (Environment): Requires PYSPARK
        """
        return SystemType.PYSPARK

    @staticmethod
    def libraries():
        spark_libraries = Libraries()
        spark_libraries.add_maven_library(get_default_package("spark_sql_kafka"))
        return spark_libraries

    @staticmethod
    def settings() -> dict:
        return {}

    def pre_write_validation(self) -> bool:
        return True

    def post_write_validation(self) -> bool:
        return True

    # Code is from Azure Eventhub Python SDK. Will import the package if possible with Conda in the  conda-forge channel in the future
    def _parse_connection_string(self, connection_string: str):
        conn_settings = [s.split("=", 1) for s in connection_string.split(";")]
        if any(len(tup) != 2 for tup in conn_settings):
            raise ValueError("Connection string is either blank or malformed.")
        conn_settings = dict(conn_settings)
        shared_access_signature = None
        for key, value in conn_settings.items():
            if key.lower() == "sharedaccesssignature":
                shared_access_signature = value
        shared_access_key = conn_settings.get("SharedAccessKey")
        shared_access_key_name = conn_settings.get("SharedAccessKeyName")
        if any([shared_access_key, shared_access_key_name]) and not all(
            [shared_access_key, shared_access_key_name]
        ):
            raise ValueError(
                "Connection string must have both SharedAccessKeyName and SharedAccessKey."
            )
        if shared_access_signature is not None and shared_access_key is not None:
            raise ValueError(
                "Only one of the SharedAccessKey or SharedAccessSignature must be present."
            )
        endpoint = conn_settings.get("Endpoint")
        if not endpoint:
            raise ValueError("Connection string is either blank or malformed.")
        parsed = urlparse(endpoint.rstrip("/"))
        if not parsed.netloc:
            raise ValueError("Invalid Endpoint on the Connection String.")
        namespace = parsed.netloc.strip()
        properties = {
            "fully_qualified_namespace": namespace,
            "endpoint": endpoint,
            "eventhub_name": conn_settings.get("EntityPath"),
            "shared_access_signature": shared_access_signature,
            "shared_access_key_name": shared_access_key_name,
            "shared_access_key": shared_access_key,
        }
        return properties

    def _connection_string_builder(self, properties: dict) -> str:
        connection_string = "Endpoint=" + properties.get("endpoint") + ";"

        if properties.get("shared_access_key"):
            connection_string += (
                "SharedAccessKey=" + properties.get("shared_access_key") + ";"
            )

        if properties.get("shared_access_key_name"):
            connection_string += (
                "SharedAccessKeyName=" + properties.get("shared_access_key_name") + ";"
            )

        if properties.get("shared_access_signature"):
            connection_string += (
                "SharedAccessSignature="
                + properties.get("shared_access_signature")
                + ";"
            )
        return connection_string

    def _configure_options(self, options: dict) -> dict:
        if "topic" not in options:
            options["topic"] = self.connection_string_properties.get("eventhub_name")

        if "kafka.bootstrap.servers" not in options:
            options["kafka.bootstrap.servers"] = (
                self.connection_string_properties.get("fully_qualified_namespace")
                + ":9093"
            )

        if "kafka.sasl.mechanism" not in options:
            options["kafka.sasl.mechanism"] = "PLAIN"

        if "kafka.security.protocol" not in options:
            options["kafka.security.protocol"] = "SASL_SSL"

        if "kafka.sasl.jaas.config" not in options:
            kafka_package = "org.apache.kafka.common.security.plain.PlainLoginModule"
            if "DATABRICKS_RUNTIME_VERSION" in os.environ or (
                "_client" in self.spark.__dict__
                and "databricks" in self.spark.client.host
            ):
                kafka_package = "kafkashaded.org.apache.kafka.common.security.plain.PlainLoginModule"
            connection_string = self._connection_string_builder(
                self.connection_string_properties
            )
            options["kafka.sasl.jaas.config"] = (
                '{} required username="$ConnectionString" password="{}";'.format(
                    kafka_package, connection_string
                )
            )  # NOSONAR

        if "kafka.request.timeout.ms" not in options:
            options["kafka.request.timeout.ms"] = "60000"

        if "kafka.session.timeout.ms" not in options:
            options["kafka.session.timeout.ms"] = "60000"

        if "kafka.group.id" not in options:
            options["kafka.group.id"] = self.consumer_group

        options["includeHeaders"] = "true"

        return options

    def _transform_to_eventhub_schema(self, df: DataFrame) -> DataFrame:
        column_list = ["key", "headers", "topic", "partition"]
        if "value" not in df.columns:
            df = df.withColumn(
                "value",
                to_json(
                    struct(
                        [
                            col(column).alias(column)
                            for column in df.columns
                            if column not in column_list
                        ]
                    )
                ),
            )
        if "headers" in df.columns and (
            df.schema["headers"].dataType.elementType["key"].nullable == True
            or df.schema["headers"].dataType.elementType["value"].nullable == True
        ):
            raise ValueError("key and value in the headers column cannot be nullable")

        return df.select(
            [
                column
                for column in df.columns
                if column in ["value", "key", "headers", "topic", "partition"]
            ]
        )

    def write_batch(self) -> DataFrame:
        """
        Writes batch data to Kafka.
        """
        try:
            df = self._transform_to_eventhub_schema(self.data)
            df.write.format("kafka").options(**self.options).save()

        except Py4JJavaError as e:
            logging.exception(e.errmsg)
            raise e
        except Exception as e:
            logging.exception(str(e))
            raise e

    def write_stream(self) -> DataFrame:
        """
        Writes streaming data to Kafka.
        """
        try:
            df = self._transform_to_eventhub_schema(self.data)
            TRIGGER_OPTION = (
                {"availableNow": True}
                if self.trigger == "availableNow"
                else {"processingTime": self.trigger}
            )
            query = (
                df.writeStream.trigger(**TRIGGER_OPTION)
                .format("kafka")
                .options(**self.options)
                .queryName(self.query_name)
                .start()
            )

            if self.query_wait_interval:
                while query.isActive:
                    if query.lastProgress:
                        logging.info(query.lastProgress)
                    time.sleep(self.query_wait_interval)

        except Py4JJavaError as e:
            logging.exception(e.errmsg)
            raise e
        except Exception as e:
            logging.exception(str(e))
            raise e

system_type() staticmethod

Attributes:

Name Type Description
SystemType Environment

Requires PYSPARK

Source code in src/sdk/python/rtdip_sdk/pipelines/destinations/spark/kafka_eventhub.py
@staticmethod
def system_type():
    """
    Attributes:
        SystemType (Environment): Requires PYSPARK
    """
    return SystemType.PYSPARK

write_batch()

Writes batch data to Kafka.

Source code in src/sdk/python/rtdip_sdk/pipelines/destinations/spark/kafka_eventhub.py
def write_batch(self) -> DataFrame:
    """
    Writes batch data to Kafka.
    """
    try:
        df = self._transform_to_eventhub_schema(self.data)
        df.write.format("kafka").options(**self.options).save()

    except Py4JJavaError as e:
        logging.exception(e.errmsg)
        raise e
    except Exception as e:
        logging.exception(str(e))
        raise e

write_stream()

Writes streaming data to Kafka.

Source code in src/sdk/python/rtdip_sdk/pipelines/destinations/spark/kafka_eventhub.py
def write_stream(self) -> DataFrame:
    """
    Writes streaming data to Kafka.
    """
    try:
        df = self._transform_to_eventhub_schema(self.data)
        TRIGGER_OPTION = (
            {"availableNow": True}
            if self.trigger == "availableNow"
            else {"processingTime": self.trigger}
        )
        query = (
            df.writeStream.trigger(**TRIGGER_OPTION)
            .format("kafka")
            .options(**self.options)
            .queryName(self.query_name)
            .start()
        )

        if self.query_wait_interval:
            while query.isActive:
                if query.lastProgress:
                    logging.info(query.lastProgress)
                time.sleep(self.query_wait_interval)

    except Py4JJavaError as e:
        logging.exception(e.errmsg)
        raise e
    except Exception as e:
        logging.exception(str(e))
        raise e
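
As the trigger logic above shows, `trigger="availableNow"` performs a single write of the currently available data, while any other value is treated as a `processingTime` interval. A rough sketch of the two modes, with the DataFrame and connection details as placeholders:

```python
# One-off write of the currently available data
SparkKafkaEventhubDestination(
    spark=spark,
    data=df,
    connection_string="{YOUR-EVENTHUB-CONNECTION-STRING}",
    options={},
    consumer_group="{YOUR-EVENTHUB-CONSUMER-GROUP}",
    trigger="availableNow",
).write_stream()

# Micro-batch write every 30 seconds
SparkKafkaEventhubDestination(
    spark=spark,
    data=df,
    connection_string="{YOUR-EVENTHUB-CONNECTION-STRING}",
    options={},
    consumer_group="{YOUR-EVENTHUB-CONSUMER-GROUP}",
    trigger="30 seconds",
).write_stream()
```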

EVMContractDestination

Bases: DestinationInterface

The EVM Contract Destination is used to write to a smart contract blockchain.

Examples

from rtdip_sdk.pipelines.destinations import EVMContractDestination

evm_contract_destination = EVMContractDestination(
    url="https://polygon-mumbai.g.alchemy.com/v2/⟨API_KEY⟩",
    account="{ACCOUNT-ADDRESS}",
    private_key="{PRIVATE-KEY}",
    abi="{SMART-CONTRACT'S-ABI}",
    contract="{SMART-CONTRACT-ADDRESS}",
    function_name="{SMART-CONTRACT-FUNCTION}",
    function_params=({PARAMETER_1}, {PARAMETER_2}, {PARAMETER_3}),
    transaction={'gas': {GAS}, 'gasPrice': {GAS-PRICE}},
)

evm_contract_destination.write_batch()

Parameters:

Name Type Description Default
url str

Blockchain network URL e.g. 'https://polygon-mumbai.g.alchemy.com/v2/<API_KEY>'

required
account str

Address of the sender that will be signing the transaction.

required
private_key str

Private key for your blockchain account.

required
abi json str

Smart contract's ABI.

required
contract str

Address of the smart contract.

None
function_name str

Smart contract method to call on.

None
function_params tuple

Parameters of given function.

None
transaction dict

A dictionary containing a set of instructions to interact with a smart contract deployed on the blockchain (See common parameters in Attributes table below).

None

Attributes:

Name Type Description
data hexadecimal str

Additional information store in the transaction.

from hexadecimal str

Address of sender for a transaction.

gas int

Amount of gas units to perform a transaction.

gasPrice int Wei

Price to pay for each unit of gas. Integers are specified in Wei, web3's to_wei function can be used to specify the amount in a different currency.

nonce int

The number of transactions sent from a given address.

to hexadecimal str

Address of recipient for a transaction.

value int Wei

Value being transferred in a transaction. Integers are specified in Wei, web3's to_wei function can be used to specify the amount in a different currency.
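
Putting these together, a transaction dictionary might look like the following sketch; all values are illustrative, and `nonce` and `from` are filled in automatically by the destination when omitted (see `_process_transaction` in the source below).

```python
from web3 import Web3

# Hypothetical transaction dict; gasPrice and value expressed via web3's to_wei helper
transaction = {
    "gas": 100000,
    "gasPrice": Web3.to_wei(1, "gwei"),
    "value": Web3.to_wei(1, "ether"),  # only needed when transferring funds
}
```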

Source code in src/sdk/python/rtdip_sdk/pipelines/destinations/blockchain/evm.py
class EVMContractDestination(DestinationInterface):
    """
    The EVM Contract Destination is used to write to a smart contract blockchain.

    Examples
    --------
    ```python
    from rtdip_sdk.pipelines.destinations import EVMContractDestination

    evm_contract_destination = EVMContractDestination(
        url="https://polygon-mumbai.g.alchemy.com/v2/⟨API_KEY⟩",
        account="{ACCOUNT-ADDRESS}",
        private_key="{PRIVATE-KEY}",
        abi="{SMART-CONTRACT'S-ABI}",
        contract="{SMART-CONTRACT-ADDRESS}",
        function_name="{SMART-CONTRACT-FUNCTION}",
        function_params=({PARAMETER_1}, {PARAMETER_2}, {PARAMETER_3}),
        transaction={'gas': {GAS}, 'gasPrice': {GAS-PRICE}},
    )

    evm_contract_destination.write_batch()
    ```

    Parameters:
        url (str): Blockchain network URL e.g. 'https://polygon-mumbai.g.alchemy.com/v2/<API_KEY>'
        account (str): Address of the sender that will be signing the transaction.
        private_key (str): Private key for your blockchain account.
        abi (json str): Smart contract's ABI.
        contract (str): Address of the smart contract.
        function_name (str): Smart contract method to call on.
        function_params (tuple): Parameters of given function.
        transaction (dict): A dictionary containing a set of instructions to interact with a smart contract deployed on the blockchain (See common parameters in Attributes table below).

    Attributes:
        data (hexadecimal str): Additional information store in the transaction.
        from (hexadecimal str): Address of sender for a transaction.
        gas (int): Amount of gas units to perform a transaction.
        gasPrice (int Wei): Price to pay for each unit of gas. Integers are specified in Wei, web3's to_wei function can be used to specify the amount in a different currency.
        nonce (int): The number of transactions sent from a given address.
        to (hexadecimal str): Address of recipient for a transaction.
        value (int Wei): Value being transferred in a transaction. Integers are specified in Wei, web3's to_wei function can be used to specify the amount in a different currency.
    """

    url: str
    account: str
    private_key: str
    abi: str
    contract: str
    function_name: str
    function_params: tuple
    transaction: dict

    def __init__(
        self,
        url: str,
        account: str,
        private_key: str,
        abi: str,
        contract: str = None,
        function_name: str = None,
        function_params: tuple = None,
        transaction: dict = None,
    ) -> None:
        self.url = url
        self.account = account
        self.private_key = private_key
        self.abi = json.loads(abi)
        self.contract = contract
        self.function_name = function_name
        self.function_params = function_params
        self.transaction = transaction
        self.web3 = Web3(Web3.HTTPProvider(self.url))

    @staticmethod
    def system_type():
        return SystemType.PYTHON

    @staticmethod
    def libraries():
        libraries = Libraries()
        return libraries

    @staticmethod
    def settings() -> dict:
        return {}

    def pre_write_validation(self) -> bool:
        return True

    def post_write_validation(self) -> bool:
        return True

    def _process_transaction(self):
        if "nonce" not in self.transaction.keys():
            nonce = self.web3.eth.get_transaction_count(self.account)
            self.transaction["nonce"] = nonce
        if "from" not in self.transaction.keys():
            self.transaction["from"] = self.account

    def write_batch(self) -> str:
        """
        Writes to a smart contract deployed in a blockchain and returns the transaction hash.

        Example:
        ```
        from web3 import Web3

        web3 = Web3(Web3.HTTPProvider("https://polygon-mumbai.g.alchemy.com/v2/<API_KEY>"))

        x = EVMContractDestination(
                            url="https://polygon-mumbai.g.alchemy.com/v2/<API_KEY>",
                            account='<ACCOUNT>',
                            private_key='<PRIVATE_KEY>',
                            contract='<CONTRACT>',
                            function_name='transferFrom',
                            function_params=('<FROM_ACCOUNT>', '<TO_ACCOUNT>', 0),
                            abi = 'ABI',
                            transaction={
                                'gas': 100000,
                                'gasPrice': 1000000000 # or web3.to_wei('1', 'gwei')
                                },
                            )

        print(x.write_batch())
        ```
        """
        contract = self.web3.eth.contract(address=self.contract, abi=self.abi)

        self._process_transaction()
        tx = contract.functions[self.function_name](
            *self.function_params
        ).build_transaction(self.transaction)

        signed_tx = self.web3.eth.account.sign_transaction(tx, self.private_key)
        tx_hash = self.web3.eth.send_raw_transaction(signed_tx.rawTransaction)
        self.web3.eth.wait_for_transaction_receipt(tx_hash)

        return str(self.web3.to_hex(tx_hash))

    def write_stream(self):
        """
        Raises:
            NotImplementedError: Write stream is not supported.
        """
        raise NotImplementedError("EVMContractDestination only supports batch writes.")

write_batch()

Writes to a smart contract deployed in a blockchain and returns the transaction hash.

Example:

from web3 import Web3

web3 = Web3(Web3.HTTPProvider("https://polygon-mumbai.g.alchemy.com/v2/<API_KEY>"))

x = EVMContractDestination(
                    url="https://polygon-mumbai.g.alchemy.com/v2/<API_KEY>",
                    account='<ACCOUNT>',
                    private_key='<PRIVATE_KEY>',
                    contract='<CONTRACT>',
                    function_name='transferFrom',
                    function_params=('<FROM_ACCOUNT>', '<TO_ACCOUNT>', 0),
                    abi = 'ABI',
                    transaction={
                        'gas': 100000,
                        'gasPrice': 1000000000 # or web3.to_wei('1', 'gwei')
                        },
                    )

print(x.write_batch())

Source code in src/sdk/python/rtdip_sdk/pipelines/destinations/blockchain/evm.py
def write_batch(self) -> str:
    """
    Writes to a smart contract deployed in a blockchain and returns the transaction hash.

    Example:
    ```
    from web3 import Web3

    web3 = Web3(Web3.HTTPProvider("https://polygon-mumbai.g.alchemy.com/v2/<API_KEY>"))

    x = EVMContractDestination(
                        url="https://polygon-mumbai.g.alchemy.com/v2/<API_KEY>",
                        account='<ACCOUNT>',
                        private_key='<PRIVATE_KEY>',
                        contract='<CONTRACT>',
                        function_name='transferFrom',
                        function_params=('<FROM_ACCOUNT>', '<TO_ACCOUNT>', 0),
                        abi = 'ABI',
                        transaction={
                            'gas': 100000,
                            'gasPrice': 1000000000 # or web3.to_wei('1', 'gwei')
                            },
                        )

    print(x.write_batch())
    ```
    """
    contract = self.web3.eth.contract(address=self.contract, abi=self.abi)

    self._process_transaction()
    tx = contract.functions[self.function_name](
        *self.function_params
    ).build_transaction(self.transaction)

    signed_tx = self.web3.eth.account.sign_transaction(tx, self.private_key)
    tx_hash = self.web3.eth.send_raw_transaction(signed_tx.rawTransaction)
    self.web3.eth.wait_for_transaction_receipt(tx_hash)

    return str(self.web3.to_hex(tx_hash))

write_stream()

Raises:

Type Description
NotImplementedError

Write stream is not supported.

Source code in src/sdk/python/rtdip_sdk/pipelines/destinations/blockchain/evm.py
def write_stream(self):
    """
    Raises:
        NotImplementedError: Write stream is not supported.
    """
    raise NotImplementedError("EVMContractDestination only supports batch writes.")

PythonDeltaDestination

Bases: DestinationInterface

The Python Delta Destination is used to write data to a Delta table from a Polars LazyFrame.

Example

from rtdip_sdk.pipelines.destinations import PythonDeltaDestination

path = "abfss://{FILE-SYSTEM}@{ACCOUNT-NAME}.dfs.core.windows.net/{PATH}/{FILE-NAME}

python_delta_destination = PythonDeltaDestination(
    data=LazyFrame
    path=path,
    storage_options={
        "azure_storage_account_name": "{AZURE-STORAGE-ACCOUNT-NAME}",
        "azure_storage_account_key": "{AZURE-STORAGE-ACCOUNT-KEY}"
    },
    mode=:error",
    overwrite_schema=False,
    delta_write_options=None
)

python_delta_destination.read_batch()
from rtdip_sdk.pipelines.destinations import PythonDeltaDestination

path = "https://s3.{REGION-CODE}.amazonaws.com/{BUCKET-NAME}/{KEY-NAME}"

python_delta_destination = PythonDeltaDestination(
    data=LazyFrame,
    path=path,
    options={
        "aws_access_key_id": "{AWS-ACCESS-KEY-ID}",
        "aws_secret_access_key": "{AWS-SECRET-ACCESS-KEY}"
    },
    mode="error",
    overwrite_schema=False,
    delta_write_options=None
)

python_delta_destination.write_batch()

Parameters:

Name Type Description Default
data LazyFrame

Polars LazyFrame to be written to Delta

required
path str

Path to Delta table to be written to; either local or remote. Locally, if the table doesn't exist one will be created, but to write to AWS or Azure you must have an existing Delta Table

required
options Optional dict

Used if writing to a remote location. For AWS use format {"aws_access_key_id": "<>", "aws_secret_access_key": "<>"}. For Azure use format {"azure_storage_account_name": "storageaccountname", "azure_storage_access_key": "<>"}

None
mode Literal['error', 'append', 'overwrite', 'ignore']

Defaults to error if table exists, 'ignore' won't write anything if table exists

'error'
overwrite_schema bool

If True will allow for the table schema to be overwritten

False
delta_write_options dict

Options when writing to a Delta table. See here for all options

None
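
As a concrete local sketch (the frame contents and path are illustrative only), a Polars LazyFrame can be written without any storage options:

```python
import polars as pl
from rtdip_sdk.pipelines.destinations import PythonDeltaDestination

lf = pl.LazyFrame({"TagName": ["tag_1"], "Value": [1.0]})  # hypothetical data

# Local paths need no options dict; a local table is created if it does not already exist
PythonDeltaDestination(data=lf, path="/tmp/delta_table").write_batch()
```
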
Source code in src/sdk/python/rtdip_sdk/pipelines/destinations/python/delta.py
class PythonDeltaDestination(DestinationInterface):
    """
    The Python Delta Destination is used to write data to a Delta table from a Polars LazyFrame.

     Example
    --------
    === "Azure"

        ```python
        from rtdip_sdk.pipelines.destinations import PythonDeltaDestination

        path = "abfss://{FILE-SYSTEM}@{ACCOUNT-NAME}.dfs.core.windows.net/{PATH}/{FILE-NAME}

        python_delta_destination = PythonDeltaDestination(
            data=LazyFrame
            path=path,
            storage_options={
                "azure_storage_account_name": "{AZURE-STORAGE-ACCOUNT-NAME}",
                "azure_storage_account_key": "{AZURE-STORAGE-ACCOUNT-KEY}"
            },
            mode=:error",
            overwrite_schema=False,
            delta_write_options=None
        )

        python_delta_destination.read_batch()

        ```
    === "AWS"

        ```python
        from rtdip_sdk.pipelines.destinations import PythonDeltaDestination

        path = "https://s3.{REGION-CODE}.amazonaws.com/{BUCKET-NAME}/{KEY-NAME}"

        python_delta_destination = PythonDeltaDestination(
            data=LazyFrame,
            path=path,
            options={
                "aws_access_key_id": "{AWS-ACCESS-KEY-ID}",
                "aws_secret_access_key": "{AWS-SECRET-ACCESS-KEY}"
            },
            mode="error",
            overwrite_schema=False,
            delta_write_options=None
        )

        python_delta_destination.write_batch()
        ```

    Parameters:
        data (LazyFrame): Polars LazyFrame to be written to Delta
        path (str): Path to Delta table to be written to; either local or [remote](https://delta-io.github.io/delta-rs/python/usage.html#loading-a-delta-table){ target="_blank" }. **Locally**, if the table doesn't exist one will be created, but to write to AWS or Azure, you must have an existing Delta Table
        options (Optional dict): Used if writing to a remote location. For AWS use format {"aws_access_key_id": "<>", "aws_secret_access_key": "<>"}. For Azure use format {"azure_storage_account_name": "storageaccountname", "azure_storage_access_key": "<>"}
        mode (Literal['error', 'append', 'overwrite', 'ignore']): Defaults to error if table exists, 'ignore' won't write anything if table exists
        overwrite_schema (bool): If True will allow for the table schema to be overwritten
        delta_write_options (dict): Options when writing to a Delta table. See [here](https://delta-io.github.io/delta-rs/python/api_reference.html#writing-deltatables){ target="_blank" } for all options
    """

    data: LazyFrame
    path: str
    options: dict
    mode: Literal["error", "append", "overwrite", "ignore"]
    overwrite_schema: bool
    delta_write_options: dict

    def __init__(
        self,
        data: LazyFrame,
        path: str,
        options: dict = None,
        mode: Literal["error", "append", "overwrite", "ignore"] = "error",
        overwrite_schema: bool = False,
        delta_write_options: dict = None,
    ) -> None:
        self.data = data
        self.path = path
        self.options = options
        self.mode = mode
        self.overwrite_schema = overwrite_schema
        self.delta_write_options = delta_write_options

    @staticmethod
    def system_type():
        """
        Attributes:
            SystemType (Environment): Requires PYTHON
        """
        return SystemType.PYTHON

    @staticmethod
    def libraries():
        libraries = Libraries()
        return libraries

    @staticmethod
    def settings() -> dict:
        return {}

    def pre_write_validation(self):
        return True

    def post_write_validation(self):
        return True

    def write_batch(self):
        """
        Writes batch data to Delta without using Spark.
        """
        if isinstance(self.data, pl.LazyFrame):
            df = self.data.collect()
            df.write_delta(
                self.path,
                mode=self.mode,
                overwrite_schema=self.overwrite_schema,
                storage_options=self.options,
                delta_write_options=self.delta_write_options,
            )
        else:
            raise ValueError(
                "Data must be a Polars LazyFrame. See https://pola-rs.github.io/polars/py-polars/html/reference/lazyframe/index.html"
            )

    def write_stream(self):
        """
        Raises:
            NotImplementedError: Writing to a Delta table using Python is only possible for batch writes. To perform a streaming read, use the write_stream method of the SparkDeltaDestination component.
        """
        raise NotImplementedError(
            "Writing to a Delta table using Python is only possible for batch writes. To perform a streaming read, use the write_stream method of the SparkDeltaDestination component"
        )

system_type() staticmethod

Attributes:

Name Type Description
SystemType Environment

Requires PYTHON

Source code in src/sdk/python/rtdip_sdk/pipelines/destinations/python/delta.py
@staticmethod
def system_type():
    """
    Attributes:
        SystemType (Environment): Requires PYTHON
    """
    return SystemType.PYTHON

write_batch()

Writes batch data to Delta without using Spark.

Source code in src/sdk/python/rtdip_sdk/pipelines/destinations/python/delta.py
def write_batch(self):
    """
    Writes batch data to Delta without using Spark.
    """
    if isinstance(self.data, pl.LazyFrame):
        df = self.data.collect()
        df.write_delta(
            self.path,
            mode=self.mode,
            overwrite_schema=self.overwrite_schema,
            storage_options=self.options,
            delta_write_options=self.delta_write_options,
        )
    else:
        raise ValueError(
            "Data must be a Polars LazyFrame. See https://pola-rs.github.io/polars/py-polars/html/reference/lazyframe/index.html"
        )

write_stream()

Raises:

Type Description
NotImplementedError

Writing to a Delta table using Python is only possible for batch writes. To perform a streaming read, use the write_stream method of the SparkDeltaDestination component.

Source code in src/sdk/python/rtdip_sdk/pipelines/destinations/python/delta.py
def write_stream(self):
    """
    Raises:
        NotImplementedError: Writing to a Delta table using Python is only possible for batch writes. To perform a streaming read, use the write_stream method of the SparkDeltaDestination component.
    """
    raise NotImplementedError(
        "Writing to a Delta table using Python is only possible for batch writes. To perform a streaming read, use the write_stream method of the SparkDeltaDestination component"
    )

DatabricksSecrets

Bases: SecretsInterface

Retrieves secrets from Databricks Secret Scopes. For more information about Databricks Secret Scopes, see here.

Example

# Reads Secrets from Databricks Secret Scopes

from rtdip_sdk.pipelines.secrets import DatabricksSecrets
from rtdip_sdk.pipelines.utilities import SparkSessionUtility

# Not required if using Databricks
spark = SparkSessionUtility(config={}).execute()

get_databricks_secret = DatabricksSecrets(
    spark=spark,
    vault="{NAME-OF-DATABRICKS-SECRET-SCOPE}"
    key="{KEY-NAME-OF-SECRET}",
)

get_databricks_secret.get()

Parameters:

Name Type Description Default
spark SparkSession

Spark Session required to access Databricks secret utilities (dbutils)

required
vault str

Name of the Databricks Secret Scope

required
key str

Name/Key of the secret in the Databricks Secret Scope

required
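
Since `get()` simply returns the secret value, it can be passed straight into other components' configuration. A short sketch, assuming a `spark` session as in the example above and placeholder scope/key names:

```python
from rtdip_sdk.pipelines.secrets import DatabricksSecrets

secret_value = DatabricksSecrets(
    spark=spark,
    vault="{NAME-OF-DATABRICKS-SECRET-SCOPE}",
    key="{KEY-NAME-OF-SECRET}",
).get()

# e.g. use secret_value as a storage account key or connection string credential
```
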
Source code in src/sdk/python/rtdip_sdk/pipelines/secrets/databricks.py
class DatabricksSecrets(SecretsInterface):
    """
    Retrieves secrets from Databricks Secret Scopes. For more information about Databricks Secret Scopes, see [here.](https://docs.databricks.com/security/secrets/secret-scopes.html)

    Example
    -------
    ```python
    # Reads Secrets from Databricks Secret Scopes

    from rtdip_sdk.pipelines.secrets import DatabricksSecrets
    from rtdip_sdk.pipelines.utilities import SparkSessionUtility

    # Not required if using Databricks
    spark = SparkSessionUtility(config={}).execute()

    get_databricks_secret = DatabricksSecrets(
        spark=spark,
        vault="{NAME-OF-DATABRICKS-SECRET-SCOPE}"
        key="{KEY-NAME-OF-SECRET}",
    )

    get_databricks_secret.get()
    ```

    Parameters:
        spark: Spark Session required to access Databricks secret utilities (dbutils)
        vault: Name of the Databricks Secret Scope
        key: Name/Key of the secret in the Databricks Secret Scope
    """

    spark: SparkSession
    vault: str
    key: str

    def __init__(self, spark: SparkSession, vault: str, key: str):
        self.spark = spark
        self.vault = vault
        self.key = key

    @staticmethod
    def system_type():
        """
        Attributes:
            SystemType (Environment): Requires PYSPARK on Databricks
        """
        return SystemType.PYSPARK_DATABRICKS

    @staticmethod
    def libraries():
        libraries = Libraries()
        return libraries

    @staticmethod
    def settings() -> dict:
        return {}

    def get(self):
        """
        Retrieves the secret from the Databricks Secret Scope
        """
        dbutils = get_dbutils(self.spark)
        return dbutils.secrets.get(scope=self.vault, key=self.key)

    def set(self):
        """
        Sets the secret in the Secret Scope
        Raises:
            NotImplementedError: Will be implemented at a later point in time
        """
        return NotImplementedError

system_type() staticmethod

Attributes:

Name Type Description
SystemType Environment

Requires PYSPARK on Databricks

Source code in src/sdk/python/rtdip_sdk/pipelines/secrets/databricks.py
@staticmethod
def system_type():
    """
    Attributes:
        SystemType (Environment): Requires PYSPARK on Databricks
    """
    return SystemType.PYSPARK_DATABRICKS

get()

Retrieves the secret from the Databricks Secret Scope

Source code in src/sdk/python/rtdip_sdk/pipelines/secrets/databricks.py
def get(self):
    """
    Retrieves the secret from the Databricks Secret Scope
    """
    dbutils = get_dbutils(self.spark)
    return dbutils.secrets.get(scope=self.vault, key=self.key)

set()

Sets the secret in the Secret Scope

Raises:

Type Description
NotImplementedError

Will be implemented at a later point in time

Source code in src/sdk/python/rtdip_sdk/pipelines/secrets/databricks.py
def set(self):
    """
    Sets the secret in the Secret Scope
    Raises:
        NotImplementedError: Will be implemented at a later point in time
    """
    return NotImplementedError

HashiCorpVaultSecrets

Bases: SecretsInterface

Retrieves and creates/updates secrets in a Hashicorp Vault. For more information about Hashicorp Vaults, see here.

Example

# Retrieves Secrets from HashiCorp Vault

from rtdip_sdk.pipelines.secrets import HashiCorpVaultSecrets

get_hashicorp_secret = HashiCorpVaultSecrets(
    vault="http://127.0.0.1:8200",
    key="{KEY}",
    secret=None,
    credential="{CREDENTIAL}",
    kwargs={}
)

get_hashicorp_secret.get()
# Creates or Updates Secrets in Hashicorp Vault

from rtdip_sdk.pipelines.secrets import HashiCorpVaultSecrets

set_hashicorp_secret = HashiCorpVaultSecrets(
    vault="http://127.0.0.1:8200",
    key="{KEY}",
    secret="{SECRET-TO-BE-SET}",
    credential="{CREDENTIAL}",
    kwargs={}
)

set_hashicorp_secret.set()

Parameters:

Name Type Description Default
vault str

Hashicorp Vault URL

required
key str

Name/Key of the secret in the Hashicorp Vault

required
secret str

Secret or Password to be stored in the Hashicorp Vault

None
credential str

Token for authentication with the Hashicorp Vault

None
kwargs dict

List of additional parameters to be passed when creating a Hashicorp Vault Client. Please see here for more details on parameters that can be provided to the client

{}
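
Note that, as the source below shows, `get()` reads the key/value (v2) secret at the given path and returns its `password` field, and `set()` stores the value under that same field. A rough round-trip sketch with placeholder values:

```python
from rtdip_sdk.pipelines.secrets import HashiCorpVaultSecrets

# Store "{SECRET-TO-BE-SET}" under the "password" field at path "{KEY}"
HashiCorpVaultSecrets(
    vault="http://127.0.0.1:8200",
    key="{KEY}",
    secret="{SECRET-TO-BE-SET}",
    credential="{TOKEN}",
).set()

# Read it back; only the "password" field is returned
value = HashiCorpVaultSecrets(
    vault="http://127.0.0.1:8200",
    key="{KEY}",
    credential="{TOKEN}",
).get()
```
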
Source code in src/sdk/python/rtdip_sdk/pipelines/secrets/hashicorp_vault.py
class HashiCorpVaultSecrets(SecretsInterface):
    """
    Retrieves and creates/updates secrets in a Hashicorp Vault. For more information about Hashicorp Vaults, see [here.](https://developer.hashicorp.com/vault/docs/get-started/developer-qs)

    Example
    -------
    ```python
    # Retrieves Secrets from HashiCorp Vault

    from rtdip_sdk.pipelines.secrets import HashiCorpVaultSecrets

    get_hashicorp_secret = HashiCorpVaultSecrets(
        vault="http://127.0.0.1:8200",
        key="{KEY}",
        secret=None,
        credential="{CREDENTIAL}",
        kwargs={}
    )

    get_hashicorp_secret.get()

    ```
    ```python
    # Creates or Updates Secrets in Hashicorp Vault

    from rtdip_sdk.pipelines.secrets import HashiCorpVaultSecrets

    set_hashicorp_secret = HashiCorpVaultSecrets(
        vault="http://127.0.0.1:8200",
        key="{KEY}",
        secret="{SECRET-TO-BE-SET}",
        credential="{CREDENTIAL}",
        kwargs={}
    )

    set_hashicorp_secret.set()
    ```

    Parameters:
        vault (str): Hashicorp Vault URL
        key (str): Name/Key of the secret in the Hashicorp Vault
        secret (str): Secret or Password to be stored in the Hashicorp Vault
        credential (str): Token for authentication with the Hashicorp Vault
        kwargs (dict): List of additional parameters to be passed when creating a Hashicorp Vault Client. Please see [here](https://hvac.readthedocs.io/en/stable/overview.html#initialize-the-client) for more details on parameters that can be provided to the client
    """

    vault: str
    key: str
    secret: str
    credential: str

    def __init__(
        self,
        vault: str,
        key: str,
        secret: str = None,
        credential: str = None,
        kwargs: dict = {},
    ):  # NOSONAR
        self.vault = vault
        self.key = key
        self.secret = secret
        self.credential = credential
        self.kwargs = kwargs
        self.client = self._get_hvac_client()

    @staticmethod
    def system_type():
        """
        Attributes:
            SystemType (Environment): Requires PYTHON
        """
        return SystemType.PYTHON

    @staticmethod
    def libraries():
        libraries = Libraries()
        libraries.add_pypi_library(get_default_package("hashicorp_vault"))
        return libraries

    @staticmethod
    def settings() -> dict:
        return {}

    def _get_hvac_client(self):
        return hvac.Client(url=self.vault, token=self.credential, **self.kwargs)

    def get(self):
        """
        Retrieves the secret from the Hashicorp Vault
        """
        response = self.client.secrets.kv.read_secret_version(path=self.key)
        return response["data"]["data"]["password"]

    def set(self):
        """
        Creates or updates a secret in the Hashicorp Vault
        """
        self.client.secrets.kv.v2.create_or_update_secret(
            path=self.key,
            secret=dict(password=self.secret),
        )
        return True

system_type() staticmethod

Attributes:

Name Type Description
SystemType Environment

Requires PYTHON

Source code in src/sdk/python/rtdip_sdk/pipelines/secrets/hashicorp_vault.py
@staticmethod
def system_type():
    """
    Attributes:
        SystemType (Environment): Requires PYTHON
    """
    return SystemType.PYTHON

get()

Retrieves the secret from the Hashicorp Vault

Source code in src/sdk/python/rtdip_sdk/pipelines/secrets/hashicorp_vault.py
def get(self):
    """
    Retrieves the secret from the Hashicorp Vault
    """
    response = self.client.secrets.kv.read_secret_version(path=self.key)
    return response["data"]["data"]["password"]

set()

Creates or updates a secret in the Hashicorp Vault

Source code in src/sdk/python/rtdip_sdk/pipelines/secrets/hashicorp_vault.py
def set(self):
    """
    Creates or updates a secret in the Hashicorp Vault
    """
    self.client.secrets.kv.v2.create_or_update_secret(
        path=self.key,
        secret=dict(password=self.secret),
    )
    return True

AzureKeyVaultSecrets

Bases: SecretsInterface

Retrieves and creates/updates secrets in Azure Key Vault. For more information about Azure Key Vaults, see here.

Example

# Retrieves Secrets from Azure Key Vault

from rtdip_sdk.pipelines.secrets import AzureKeyVaultSecrets

get_key_vault_secret = AzureKeyVaultSecrets(
    vault="https://{YOUR-KEY-VAULT}.azure.net/",
    key="{KEY}",
    secret=None,
    credential="{CREDENTIAL}",
    kwargs=None
)

get_key_vault_secret.get()
# Creates or Updates Secrets in Azure Key Vault

from rtdip_sdk.pipelines.secrets import AzureKeyVaultSecrets

set_key_vault_secret = AzureKeyVaultSecrets(
    vault="https://{YOUR-KEY-VAULT}.azure.net/",
    key="{KEY}",
    secret="{SECRET-TO-BE-SET}",
    credential="{CREDENTIAL}",
    kwargs=None
)

set_key_vault_secret.set()

Parameters:

Name Type Description Default
vault str

Azure Key Vault URL

required
key str

Key for the secret

required
secret str

Secret or Password to be set in the Azure Key Vault

None
credential str

Credential for authenticating with Azure Key Vault

None
kwargs dict

List of additional parameters to be passed when creating a Azure Key Vault Client. Please see here for more details on parameters that can be provided to the client

None
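
The `credential` argument is passed directly to the underlying `SecretClient` (see the source below), so an azure-identity credential object such as `DefaultAzureCredential` can be supplied. A sketch, with the vault name as a placeholder:

```python
from azure.identity import DefaultAzureCredential
from rtdip_sdk.pipelines.secrets import AzureKeyVaultSecrets

secret_value = AzureKeyVaultSecrets(
    vault="{YOUR-KEY-VAULT}",  # the source below expands this to https://{vault}.vault.azure.net
    key="{KEY}",
    credential=DefaultAzureCredential(),
).get()
```
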
Source code in src/sdk/python/rtdip_sdk/pipelines/secrets/azure_key_vault.py
class AzureKeyVaultSecrets(SecretsInterface):
    """
    Retrieves and creates/updates secrets in Azure Key Vault. For more information about Azure Key Vaults, see [here.](https://learn.microsoft.com/en-gb/azure/key-vault/general/overview)

    Example
    -------
    ```python
    # Retrieves Secrets from Azure Key Vault

    from rtdip_sdk.pipelines.secrets import AzureKeyVaultSecrets

    get_key_vault_secret = AzureKeyVaultSecrets(
        vault="https://{YOUR-KEY-VAULT}.azure.net/",
        key="{KEY}",
        secret=None,
        credential="{CREDENTIAL}",
        kwargs=None
    )

    get_key_vault_secret.get()

    ```
    ```python
    # Creates or Updates Secrets in Azure Key Vault

    from rtdip_sdk.pipelines.secrets import AzureKeyVaultSecrets

    set_key_vault_secret = AzureKeyVaultSecrets(
        vault="https://{YOUR-KEY-VAULT}.azure.net/",
        key="{KEY}",
        secret="{SECRET-TO-BE-SET}",
        credential="{CREDENTIAL}",
        kwargs=None
    )

    set_key_vault_secret.set()
    ```

    Parameters:
        vault (str): Azure Key Vault URL
        key (str): Key for the secret
        secret (str): Secret or Password to be set in the Azure Key Vault
        credential (str): Credential for authenticating with Azure Key Vault
        kwargs (dict): List of additional parameters to be passed when creating a Azure Key Vault Client. Please see [here](https://github.com/Azure/azure-sdk-for-python/tree/main/sdk/keyvault/azure-keyvault-secrets) for more details on parameters that can be provided to the client
    """

    vault: str
    key: str
    secret: str
    credential: str
    kwargs: dict

    def __init__(
        self,
        vault: str,
        key: str,
        secret: str = None,
        credential=None,
        kwargs: dict = None,
    ):
        self.vault = vault
        self.key = key
        self.secret = secret
        self.credential = credential
        self.kwargs = {} if kwargs is None else kwargs
        self.client = self._get_akv_client()

    @staticmethod
    def system_type():
        """
        Attributes:
            SystemType (Environment): Requires PYTHON
        """
        return SystemType.PYTHON

    @staticmethod
    def libraries():
        libraries = Libraries()
        libraries.add_pypi_library(get_default_package("azure_key_vault_secret"))
        return libraries

    @staticmethod
    def settings() -> dict:
        return {}

    def _get_akv_client(self):
        return SecretClient(
            vault_url="https://{}.vault.azure.net".format(self.vault),
            credential=self.credential,
            **self.kwargs
        )

    def get(self):
        """
        Retrieves the secret from the Azure Key Vault
        """
        response = self.client.get_secret(name=self.key)
        return response.value

    def set(self):
        """
        Creates or updates a secret in the Azure Key Vault
        """
        self.client.set_secret(name=self.key, value=self.secret)
        return True

system_type() staticmethod

Attributes:

Name Type Description
SystemType Environment

Requires PYTHON

Source code in src/sdk/python/rtdip_sdk/pipelines/secrets/azure_key_vault.py
@staticmethod
def system_type():
    """
    Attributes:
        SystemType (Environment): Requires PYTHON
    """
    return SystemType.PYTHON

get()

Retrieves the secret from the Azure Key Vault

Source code in src/sdk/python/rtdip_sdk/pipelines/secrets/azure_key_vault.py
def get(self):
    """
    Retrieves the secret from the Azure Key Vault
    """
    response = self.client.get_secret(name=self.key)
    return response.value

set()

Creates or updates a secret in the Azure Key Vault

Source code in src/sdk/python/rtdip_sdk/pipelines/secrets/azure_key_vault.py
def set(self):
    """
    Creates or updates a secret in the Azure Key Vault
    """
    self.client.set_secret(name=self.key, value=self.secret)
    return True

SystemType

Bases: Enum

The type of the system.

Source code in src/sdk/python/rtdip_sdk/pipelines/_pipeline_utils/models.py
class SystemType(Enum):
    """The type of the system."""

    # Executable in a python environment
    PYTHON = 1
    # Executable in a pyspark environment
    PYSPARK = 2
    # Executable in a databricks environment
    PYSPARK_DATABRICKS = 3
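
Every pipeline component exposes a static `system_type()` returning one of these values, which can be used to check where a component is able to run. A small sketch; the import path for `SystemType` is inferred from the source location shown above:

```python
from rtdip_sdk.pipelines._pipeline_utils.models import SystemType
from rtdip_sdk.pipelines.destinations import PythonDeltaDestination
from rtdip_sdk.pipelines.secrets import DatabricksSecrets

# system_type() is a staticmethod on every component (see the sections above)
assert PythonDeltaDestination.system_type() == SystemType.PYTHON
assert DatabricksSecrets.system_type() == SystemType.PYSPARK_DATABRICKS
```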

DeltaTableCreateUtility

Bases: UtilitiesInterface

Creates a Delta Table in a Hive Metastore or in Databricks Unity Catalog.

Example

from rtdip_sdk.pipelines.utilities.spark.delta_table_create import DeltaTableCreateUtility, DeltaTableColumn

table_create_utility = DeltaTableCreateUtility(
    spark=spark_session,
    table_name="delta_table",
    columns=[
        DeltaTableColumn(name="EventDate", type="date", nullable=False, metadata={"delta.generationExpression": "CAST(EventTime AS DATE)"}),
        DeltaTableColumn(name="TagName", type="string", nullable=False),
        DeltaTableColumn(name="EventTime", type="timestamp", nullable=False),
        DeltaTableColumn(name="Status", type="string", nullable=True),
        DeltaTableColumn(name="Value", type="float", nullable=True)
    ],
    partitioned_by=["EventDate"],
    properties={"delta.logRetentionDuration": "7 days", "delta.enableChangeDataFeed": "true"},
    comment="Creation of Delta Table"
)

result = table_create_utility.execute()

Parameters:

Name Type Description Default
spark SparkSession

Spark Session required to read data from cloud storage

required
table_name str

Name of the table, including catalog and schema if table is to be created in Unity Catalog

required
columns list[DeltaTableColumn]

List of columns and their related column properties

required
partitioned_by list[str]

List of column names to partition the table by

None
location str

Path to storage location

None
properties dict

Properties that can be specified for a Delta Table. Further information on the options available is here

None
comment str

Provides a comment on the table metadata

None
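
For a Unity Catalog table, `table_name` carries the full three-level name and `location` can optionally point at external storage. A sketch with placeholder catalog, schema and storage path:

```python
from rtdip_sdk.pipelines.utilities.spark.delta_table_create import DeltaTableCreateUtility, DeltaTableColumn

table_create_utility = DeltaTableCreateUtility(
    spark=spark_session,
    table_name="{CATALOG}.{SCHEMA}.delta_table",  # three-level name for Unity Catalog
    columns=[
        DeltaTableColumn(name="TagName", type="string", nullable=False),
        DeltaTableColumn(name="Value", type="float", nullable=True),
    ],
    location="abfss://{FILE-SYSTEM}@{ACCOUNT-NAME}.dfs.core.windows.net/{PATH}",  # optional external storage path
)

result = table_create_utility.execute()
```
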
Source code in src/sdk/python/rtdip_sdk/pipelines/utilities/spark/delta_table_create.py
class DeltaTableCreateUtility(UtilitiesInterface):
    """
    Creates a Delta Table in a Hive Metastore or in Databricks Unity Catalog.

    Example
    -------
    ```python
    from rtdip_sdk.pipelines.utilities.spark.delta_table_create import DeltaTableCreateUtility, DeltaTableColumn

    table_create_utility = DeltaTableCreateUtility(
        spark=spark_session,
        table_name="delta_table",
        columns=[
            DeltaTableColumn(name="EventDate", type="date", nullable=False, metadata={"delta.generationExpression": "CAST(EventTime AS DATE)"}),
            DeltaTableColumn(name="TagName", type="string", nullable=False),
            DeltaTableColumn(name="EventTime", type="timestamp", nullable=False),
            DeltaTableColumn(name="Status", type="string", nullable=True),
            DeltaTableColumn(name="Value", type="float", nullable=True)
        ],
        partitioned_by=["EventDate"],
        properties={"delta.logRetentionDuration": "7 days", "delta.enableChangeDataFeed": "true"},
        comment="Creation of Delta Table"
    )

    result = table_create_utility.execute()
    ```

    Parameters:
        spark (SparkSession): Spark Session required to read data from cloud storage
        table_name (str): Name of the table, including catalog and schema if table is to be created in Unity Catalog
        columns (list[DeltaTableColumn]): List of columns and their related column properties
        partitioned_by (list[str], optional): List of column names to partition the table by
        location (str, optional): Path to storage location
        properties (dict, optional): Properties that can be specified for a Delta Table. Further information on the options available is [here](https://docs.databricks.com/delta/table-properties.html#delta-table-properties)
        comment (str, optional): Provides a comment on the table metadata


    """

    spark: SparkSession
    table_name: str
    columns: List[DeltaTableColumn]
    partitioned_by: List[str]
    location: str
    properties: dict
    comment: str

    def __init__(
        self,
        spark: SparkSession,
        table_name: str,
        columns: List[StructField],
        partitioned_by: List[str] = None,
        location: str = None,
        properties: dict = None,
        comment: str = None,
    ) -> None:
        self.spark = spark
        self.table_name = table_name
        self.columns = columns
        self.partitioned_by = partitioned_by
        self.location = location
        self.properties = properties
        self.comment = comment

    @staticmethod
    def system_type():
        """
        Attributes:
            SystemType (Environment): Requires PYSPARK
        """
        return SystemType.PYSPARK

    @staticmethod
    def libraries():
        libraries = Libraries()
        libraries.add_maven_library(get_default_package("spark_delta_core"))
        return libraries

    @staticmethod
    def settings() -> dict:
        return {}

    def execute(self) -> bool:
        try:
            columns = [StructField.fromJson(column.dict()) for column in self.columns]

            delta_table = (
                DeltaTable.createIfNotExists(self.spark)
                .tableName(self.table_name)
                .addColumns(columns)
            )

            if self.partitioned_by is not None:
                delta_table = delta_table.partitionedBy(self.partitioned_by)

            if self.location is not None:
                delta_table = delta_table.location(self.location)

            if self.properties is not None:
                for key, value in self.properties.items():
                    delta_table = delta_table.property(key, value)

            if self.comment is not None:
                delta_table = delta_table.comment(self.comment)

            delta_table.execute()
            return True

        except Py4JJavaError as e:
            logging.exception(e.errmsg)
            raise e
        except Exception as e:
            logging.exception(str(e))
            raise e

system_type() staticmethod

Attributes:

Name Type Description
SystemType Environment

Requires PYSPARK

Source code in src/sdk/python/rtdip_sdk/pipelines/utilities/spark/delta_table_create.py
@staticmethod
def system_type():
    """
    Attributes:
        SystemType (Environment): Requires PYSPARK
    """
    return SystemType.PYSPARK

DeltaTableOptimizeUtility

Bases: UtilitiesInterface

Optimizes a Delta Table.

Example

from rtdip_sdk.pipelines.utilities.spark.delta_table_optimize import DeltaTableOptimizeUtility

table_optimize_utility = DeltaTableOptimizeUtility(
    spark=spark_session,
    table_name="delta_table",
    where="EventDate<=current_date()",
    zorder_by=["EventDate"]
)

result = table_optimize_utility.execute()

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| spark | SparkSession | Spark Session required to read data from cloud storage | required |
| table_name | str | Name of the table, including catalog and schema if table is to be created in Unity Catalog | required |
| where | str | Apply a partition filter to limit optimize to specific partitions. Example, "date='2021-11-18'" or "EventDate<=current_date()" | None |
| zorder_by | list[str] | List of column names to zorder the table by. For more information, see here. | None |
Source code in src/sdk/python/rtdip_sdk/pipelines/utilities/spark/delta_table_optimize.py
class DeltaTableOptimizeUtility(UtilitiesInterface):
    """
    [Optimizes](https://docs.delta.io/latest/optimizations-oss.html) a Delta Table.

    Example
    -------
    ```python
    from rtdip_sdk.pipelines.utilities.spark.delta_table_optimize import DeltaTableOptimizeUtility

    table_optimize_utility = DeltaTableOptimizeUtility(
        spark=spark_session,
        table_name="delta_table",
        where="EventDate<=current_date()",
        zorder_by=["EventDate"]
    )

    result = table_optimize_utility.execute()
    ```

    Parameters:
        spark (SparkSession): Spark Session required to read data from cloud storage
        table_name (str): Name of the table, including catalog and schema if table is to be created in Unity Catalog
        where (str, optional): Apply a partition filter to limit optimize to specific partitions. Example, "date='2021-11-18'" or "EventDate<=current_date()"
        zorder_by (list[str], optional): List of column names to zorder the table by. For more information, see [here.](https://docs.delta.io/latest/optimizations-oss.html#optimize-performance-with-file-management&language-python)
    """

    spark: SparkSession
    table_name: str
    where: Optional[str]
    zorder_by: Optional[List[str]]

    def __init__(
        self,
        spark: SparkSession,
        table_name: str,
        where: str = None,
        zorder_by: List[str] = None,
    ) -> None:
        self.spark = spark
        self.table_name = table_name
        self.where = where
        self.zorder_by = zorder_by

    @staticmethod
    def system_type():
        """
        Attributes:
            SystemType (Environment): Requires PYSPARK
        """
        return SystemType.PYSPARK

    @staticmethod
    def libraries():
        libraries = Libraries()
        libraries.add_maven_library(get_default_package("spark_delta_core"))
        return libraries

    @staticmethod
    def settings() -> dict:
        return {}

    def execute(self) -> bool:
        try:
            delta_table = DeltaTable.forName(self.spark, self.table_name).optimize()

            if self.where is not None:
                delta_table = delta_table.where(self.where)

            if self.zorder_by is not None:
                delta_table = delta_table.executeZOrderBy(self.zorder_by)
            else:
                delta_table.executeCompaction()

            return True

        except Py4JJavaError as e:
            logging.exception(e.errmsg)
            raise e
        except Exception as e:
            logging.exception(str(e))
            raise e
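The `execute` method branches on `zorder_by`: when it is supplied the optimize runs `executeZOrderBy`, otherwise it falls back to `executeCompaction`. A minimal sketch of both call patterns (the Z-order column name is a placeholder):

```python
from rtdip_sdk.pipelines.utilities.spark.delta_table_optimize import DeltaTableOptimizeUtility

# Compaction only: no zorder_by supplied, so execute() calls executeCompaction()
DeltaTableOptimizeUtility(
    spark=spark_session,
    table_name="delta_table"
).execute()

# Z-order on selected partitions: execute() calls executeZOrderBy(["TagName"])
DeltaTableOptimizeUtility(
    spark=spark_session,
    table_name="delta_table",
    where="EventDate<=current_date()",
    zorder_by=["TagName"]  # placeholder column name
).execute()
```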

system_type() staticmethod

Attributes:

| Name | Type | Description |
| --- | --- | --- |
| SystemType | Environment | Requires PYSPARK |

Source code in src/sdk/python/rtdip_sdk/pipelines/utilities/spark/delta_table_optimize.py
@staticmethod
def system_type():
    """
    Attributes:
        SystemType (Environment): Requires PYSPARK
    """
    return SystemType.PYSPARK

DeltaTableVacuumUtility

Bases: UtilitiesInterface

Vacuums a Delta Table.

Example

from rtdip_sdk.pipelines.utilities.spark.delta_table_vacuum import DeltaTableVacuumUtility

table_vacuum_utility =  DeltaTableVacuumUtility(
    spark=spark_session,
    table_name="delta_table",
    retention_hours=168
)

result = table_vacuum_utility.execute()

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| spark | SparkSession | Spark Session required to read data from cloud storage | required |
| table_name | str | Name of the table, including catalog and schema if table is to be created in Unity Catalog | required |
| retention_hours | int | Sets the retention threshold in hours. | None |
Source code in src/sdk/python/rtdip_sdk/pipelines/utilities/spark/delta_table_vacuum.py
class DeltaTableVacuumUtility(UtilitiesInterface):
    """
    [Vacuums](https://docs.delta.io/latest/delta-utility.html#-delta-vacuum) a Delta Table.

    Example
    -------
    ```python
    from rtdip_sdk.pipelines.utilities.spark.delta_table_vacuum import DeltaTableVacuumUtility

    table_vacuum_utility =  DeltaTableVacuumUtility(
        spark=spark_session,
        table_name="delta_table",
        retention_hours=168
    )

    result = table_vacuum_utility.execute()
    ```

    Parameters:
        spark (SparkSession): Spark Session required to read data from cloud storage
        table_name (str): Name of the table, including catalog and schema if table is to be created in Unity Catalog
        retention_hours (int, optional): Sets the retention threshold in hours.
    """

    spark: SparkSession
    table_name: str
    retention_hours: Optional[int]

    def __init__(
        self, spark: SparkSession, table_name: str, retention_hours: int = None
    ) -> None:
        self.spark = spark
        self.table_name = table_name
        self.retention_hours = retention_hours

    @staticmethod
    def system_type():
        """
        Attributes:
            SystemType (Environment): Requires PYSPARK
        """
        return SystemType.PYSPARK

    @staticmethod
    def libraries():
        libraries = Libraries()
        libraries.add_maven_library(get_default_package("spark_delta_core"))
        return libraries

    @staticmethod
    def settings() -> dict:
        return {}

    def execute(self) -> bool:
        try:
            delta_table = DeltaTable.forName(self.spark, self.table_name)

            delta_table.vacuum(retentionHours=self.retention_hours)

            return True

        except Py4JJavaError as e:
            logging.exception(e.errmsg)
            raise e
        except Exception as e:
            logging.exception(str(e))
            raise e
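Note that Delta Lake's retention duration check rejects a `retention_hours` value below the table's configured retention (168 hours by default). A sketch of vacuuming to a shorter window, assuming the standard Delta Lake configuration key for disabling that check:

```python
from rtdip_sdk.pipelines.utilities.spark.delta_table_vacuum import DeltaTableVacuumUtility

# Standard Delta Lake setting, not something this utility configures for you:
# required when vacuuming below the default 168 hour retention threshold.
spark_session.conf.set("spark.databricks.delta.retentionDurationCheck.enabled", "false")

DeltaTableVacuumUtility(
    spark=spark_session,
    table_name="delta_table",
    retention_hours=24
).execute()
```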

system_type() staticmethod

Attributes:

| Name | Type | Description |
| --- | --- | --- |
| SystemType | Environment | Requires PYSPARK |

Source code in src/sdk/python/rtdip_sdk/pipelines/utilities/spark/delta_table_vacuum.py
@staticmethod
def system_type():
    """
    Attributes:
        SystemType (Environment): Requires PYSPARK
    """
    return SystemType.PYSPARK

SparkConfigurationUtility

Bases: UtilitiesInterface

Sets configuration key value pairs to a Spark Session

Example

from rtdip_sdk.pipelines.utilities import SparkConfigurationUtility
from rtdip_sdk.pipelines.utilities import SparkSessionUtility

# Not required if using Databricks
spark = SparkSessionUtility(config={}).execute()

configuration_utility = SparkConfigurationUtility(
    spark=spark,
    config={}
)

result = configuration_utility.execute()

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| spark | SparkSession | Spark Session required to read data from cloud storage | required |
| config | dict | Dictionary of spark configuration to be applied to the spark session | required |
Source code in src/sdk/python/rtdip_sdk/pipelines/utilities/spark/configuration.py
class SparkConfigurationUtility(UtilitiesInterface):
    """
    Sets configuration key value pairs to a Spark Session

    Example
    --------
    ```python
    from rtdip_sdk.pipelines.utilities import SparkConfigurationUtility
    from rtdip_sdk.pipelines.utilities import SparkSessionUtility

    # Not required if using Databricks
    spark = SparkSessionUtility(config={}).execute()

    configuration_utility = SparkConfigurationUtility(
        spark=spark,
        config={}
    )

    result = configuration_utility.execute()
    ```

    Parameters:
        spark (SparkSession): Spark Session required to read data from cloud storage
        config (dict): Dictionary of spark configuration to be applied to the spark session
    """

    spark: SparkSession
    config: dict
    columns: List[StructField]
    partitioned_by: List[str]
    location: str
    properties: dict
    comment: str

    def __init__(self, spark: SparkSession, config: dict) -> None:
        self.spark = spark
        self.config = config

    @staticmethod
    def system_type():
        """
        Attributes:
            SystemType (Environment): Requires PYSPARK
        """
        return SystemType.PYSPARK

    @staticmethod
    def libraries():
        libraries = Libraries()
        return libraries

    @staticmethod
    def settings() -> dict:
        return {}

    def execute(self) -> bool:
        """Executes configuration key value pairs to a Spark Session"""
        try:
            for configuration in self.config.items():
                self.spark.conf.set(configuration[0], configuration[1])
            return True

        except Py4JJavaError as e:
            logging.exception(e.errmsg)
            raise e
        except Exception as e:
            logging.exception(str(e))
            raise e

system_type() staticmethod

Attributes:

| Name | Type | Description |
| --- | --- | --- |
| SystemType | Environment | Requires PYSPARK |

Source code in src/sdk/python/rtdip_sdk/pipelines/utilities/spark/configuration.py
@staticmethod
def system_type():
    """
    Attributes:
        SystemType (Environment): Requires PYSPARK
    """
    return SystemType.PYSPARK

execute()

Executes configuration key value pairs to a Spark Session

Source code in src/sdk/python/rtdip_sdk/pipelines/utilities/spark/configuration.py
def execute(self) -> bool:
    """Executes configuration key value pairs to a Spark Session"""
    try:
        for configuration in self.config.items():
            self.spark.conf.set(configuration[0], configuration[1])
        return True

    except Py4JJavaError as e:
        logging.exception(e.errmsg)
        raise e
    except Exception as e:
        logging.exception(str(e))
        raise e
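Since `execute` simply applies each pair with `spark.conf.set`, only runtime-settable configuration belongs in `config`. A short sketch with ordinary Spark settings used purely as an illustration:

```python
from rtdip_sdk.pipelines.utilities import SparkConfigurationUtility

SparkConfigurationUtility(
    spark=spark,
    config={
        "spark.sql.shuffle.partitions": "64",  # example runtime-settable option
        "spark.sql.session.timeZone": "UTC",   # example runtime-settable option
    },
).execute()
```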

SparkADLSGen2SPNConnectUtility

Bases: UtilitiesInterface

Configures Spark to Connect to an ADLS Gen 2 Storage Account using a Service Principal.

Example

from rtdip_sdk.pipelines.utilities import SparkADLSGen2SPNConnectUtility
from rtdip_sdk.pipelines.utilities import SparkSessionUtility

# Not required if using Databricks
spark = SparkSessionUtility(config={}).execute()

adls_gen2_connect_utility = SparkADLSGen2SPNConnectUtility(
    spark=spark,
    storage_account="YOUR-STORAGE-ACCOUNT-NAME",
    tenant_id="YOUR-TENANT-ID",
    client_id="YOUR-CLIENT-ID",
    client_secret="YOUR-CLIENT-SECRET"
)

result = adls_gen2_connect_utility.execute()

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| spark | SparkSession | Spark Session required to read data from cloud storage | required |
| storage_account | str | Name of the ADLS Gen 2 Storage Account | required |
| tenant_id | str | Tenant ID of the Service Principal | required |
| client_id | str | Service Principal Client ID | required |
| client_secret | str | Service Principal Client Secret | required |
Source code in src/sdk/python/rtdip_sdk/pipelines/utilities/spark/adls_gen2_spn_connect.py
class SparkADLSGen2SPNConnectUtility(UtilitiesInterface):
    """
    Configures Spark to Connect to an ADLS Gen 2 Storage Account using a Service Principal.

    Example
    --------
    ```python
    from rtdip_sdk.pipelines.utilities import SparkADLSGen2SPNConnectUtility
    from rtdip_sdk.pipelines.utilities import SparkSessionUtility

    # Not required if using Databricks
    spark = SparkSessionUtility(config={}).execute()

    adls_gen2_connect_utility = SparkADLSGen2SPNConnectUtility(
        spark=spark,
        storage_account="YOUR-STORAGE-ACCOUNT-NAME",
        tenant_id="YOUR-TENANT-ID",
        client_id="YOUR-CLIENT-ID",
        client_secret="YOUR-CLIENT-SECRET"
    )

    result = adls_gen2_connect_utility.execute()
    ```

    Parameters:
        spark (SparkSession): Spark Session required to read data from cloud storage
        storage_account (str): Name of the ADLS Gen 2 Storage Account
        tenant_id (str): Tenant ID of the Service Principal
        client_id (str): Service Principal Client ID
        client_secret (str): Service Principal Client Secret
    """

    spark: SparkSession
    storage_account: str
    tenant_id: str
    client_id: str
    client_secret: str

    def __init__(
        self,
        spark: SparkSession,
        storage_account: str,
        tenant_id: str,
        client_id: str,
        client_secret: str,
    ) -> None:
        self.spark = spark
        self.storage_account = storage_account
        self.tenant_id = tenant_id
        self.client_id = client_id
        self.client_secret = client_secret

    @staticmethod
    def system_type():
        """
        Attributes:
            SystemType (Environment): Requires PYSPARK
        """
        return SystemType.PYSPARK

    @staticmethod
    def libraries():
        libraries = Libraries()
        return libraries

    @staticmethod
    def settings() -> dict:
        return {}

    def execute(self) -> bool:
        """Executes spark configuration to connect to an ADLS Gen 2 Storage Account using a service principal"""
        try:
            adls_gen2_config = SparkConfigurationUtility(
                spark=self.spark,
                config={
                    "fs.azure.account.auth.type.{}.dfs.core.windows.net".format(
                        self.storage_account
                    ): "OAuth",
                    "fs.azure.account.oauth.provider.type.{}.dfs.core.windows.net".format(
                        self.storage_account
                    ): "org.apache.hadoop.fs.azurebfs.oauth2.ClientCredsTokenProvider",
                    "fs.azure.account.oauth2.client.id.{}.dfs.core.windows.net".format(
                        self.storage_account
                    ): self.client_id,
                    "fs.azure.account.oauth2.client.secret.{}.dfs.core.windows.net".format(
                        self.storage_account
                    ): self.client_secret,
                    "fs.azure.account.oauth2.client.endpoint.{}.dfs.core.windows.net".format(
                        self.storage_account
                    ): "https://login.microsoftonline.com/{}/oauth2/token".format(
                        self.tenant_id
                    ),
                },
            )
            adls_gen2_config.execute()
            return True

        except Py4JJavaError as e:
            logging.exception(e.errmsg)
            raise e
        except Exception as e:
            logging.exception(str(e))
            raise e

system_type() staticmethod

Attributes:

| Name | Type | Description |
| --- | --- | --- |
| SystemType | Environment | Requires PYSPARK |

Source code in src/sdk/python/rtdip_sdk/pipelines/utilities/spark/adls_gen2_spn_connect.py
@staticmethod
def system_type():
    """
    Attributes:
        SystemType (Environment): Requires PYSPARK
    """
    return SystemType.PYSPARK

execute()

Executes spark configuration to connect to an ADLS Gen 2 Storage Account using a service principal

Source code in src/sdk/python/rtdip_sdk/pipelines/utilities/spark/adls_gen2_spn_connect.py
def execute(self) -> bool:
    """Executes spark configuration to connect to an ADLS Gen 2 Storage Account using a service principal"""
    try:
        adls_gen2_config = SparkConfigurationUtility(
            spark=self.spark,
            config={
                "fs.azure.account.auth.type.{}.dfs.core.windows.net".format(
                    self.storage_account
                ): "OAuth",
                "fs.azure.account.oauth.provider.type.{}.dfs.core.windows.net".format(
                    self.storage_account
                ): "org.apache.hadoop.fs.azurebfs.oauth2.ClientCredsTokenProvider",
                "fs.azure.account.oauth2.client.id.{}.dfs.core.windows.net".format(
                    self.storage_account
                ): self.client_id,
                "fs.azure.account.oauth2.client.secret.{}.dfs.core.windows.net".format(
                    self.storage_account
                ): self.client_secret,
                "fs.azure.account.oauth2.client.endpoint.{}.dfs.core.windows.net".format(
                    self.storage_account
                ): "https://login.microsoftonline.com/{}/oauth2/token".format(
                    self.tenant_id
                ),
            },
        )
        adls_gen2_config.execute()
        return True

    except Py4JJavaError as e:
        logging.exception(e.errmsg)
        raise e
    except Exception as e:
        logging.exception(str(e))
        raise e
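Once `execute` has applied the OAuth configuration, the same Spark Session can address `abfss://` paths on that storage account directly. A sketch, with the container, account and path as placeholders:

```python
adls_gen2_connect_utility.execute()

df = spark.read.parquet(
    "abfss://{CONTAINER}@{STORAGE-ACCOUNT}.dfs.core.windows.net/{PATH}"
)
```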

ADLSGen2DirectoryACLUtility

Bases: UtilitiesInterface

Assigns Azure AD Groups to ACLs on directories in an Azure Data Lake Store Gen 2 storage account.

Example

from rtdip_sdk.pipelines.utilities import ADLSGen2DirectoryACLUtility

adls_gen2_directory_acl_utility = ADLSGen2DirectoryACLUtility(
    storage_account="YOUR-STORAGE-ACCOUNT-NAME",
    container="YOUR-ADLS-CONTAINER-NAME",
    credential="YOUR-TOKEN-CREDENTIAL",
    directory="DIRECTORY",
    group_object_id="GROUP-OBJECT",
    folder_permissions="r-x",
    parent_folder_permissions="r-x",
    root_folder_permissions="r-x",
    set_as_default_acl=True,
    create_directory_if_not_exists=True
)

result = adls_gen2_directory_acl_utility.execute()

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| storage_account | str | ADLS Gen 2 Storage Account Name | required |
| container | str | ADLS Gen 2 Container Name | required |
| credential | TokenCredential | Credentials to authenticate with ADLS Gen 2 Storage Account | required |
| directory | str | Directory to assign ACLs to in an ADLS Gen 2 storage account | required |
| group_object_id | str | Azure AD Group Object ID to be assigned to the directory | required |
| folder_permissions | str, optional | Folder permissions to assign to the directory | 'r-x' |
| parent_folder_permissions | str, optional | Folder permissions to assign to parent directories. Parent folder ACLs are not set if None | 'r-x' |
| root_folder_permissions | str, optional | Folder permissions to assign to the root directory. Root folder ACL is not set if None | 'r-x' |
| set_as_default_acl | bool | Sets the ACL as the default ACL on the folder | True |
| create_directory_if_not_exists | bool | Creates the directory (and parent directories) if it does not exist | True |
Source code in src/sdk/python/rtdip_sdk/pipelines/utilities/azure/adls_gen2_acl.py
class ADLSGen2DirectoryACLUtility(UtilitiesInterface):
    """
    Assigns Azure AD Groups to ACLs on directories in an Azure Data Lake Store Gen 2 storage account.

    Example
    --------
    ```python
    from rtdip_sdk.pipelines.utilities import ADLSGen2DirectoryACLUtility

    adls_gen2_directory_acl_utility = ADLSGen2DirectoryACLUtility(
        storage_account="YOUR-STORAGE-ACCOUNT-NAME",
        container="YOUR-ADLS-CONTAINER-NAME",
        credential="YOUR-TOKEN-CREDENTIAL",
        directory="DIRECTORY",
        group_object_id="GROUP-OBJECT",
        folder_permissions="r-x",
        parent_folder_permissions="r-x",
        root_folder_permissions="r-x",
        set_as_default_acl=True,
        create_directory_if_not_exists=True
    )

    result = adls_gen2_directory_acl_utility.execute()
    ```

    Parameters:
        storage_account (str): ADLS Gen 2 Storage Account Name
        container (str): ADLS Gen 2 Container Name
        credential (TokenCredential): Credentials to authenticate with ADLS Gen 2 Storage Account
        directory (str): Directory to assign ACLs to in an ADLS Gen 2 storage account
        group_object_id (str): Azure AD Group Object ID to be assigned to the directory
        folder_permissions (str, optional): Folder permissions to assign to the directory
        parent_folder_permissions (str, optional): Folder permissions to assign to parent directories. Parent folder ACLs are not set if None
        root_folder_permissions (str, optional): Folder permissions to assign to the root directory. Root folder ACL is not set if None
        set_as_default_acl (bool, optional): Sets the ACL as the default ACL on the folder
        create_directory_if_not_exists (bool, optional): Creates the directory (and parent directories) if it does not exist
    """

    storage_account: str
    container: str
    credential: Union[
        str,
        Dict[str, str],
        AzureNamedKeyCredential,
        AzureSasCredential,
        TokenCredential,
        None,
    ]
    directory: str
    group_object_id: str
    folder_permissions: str
    parent_folder_permissions: str
    root_folder_permissions: str
    set_as_default_acl: bool
    create_directory_if_not_exists: bool

    def __init__(
        self,
        storage_account: str,
        container: str,
        credential: Union[
            str,
            Dict[str, str],
            AzureNamedKeyCredential,
            AzureSasCredential,
            TokenCredential,
            None,
        ],
        directory: str,
        group_object_id: str,
        folder_permissions: str = "r-x",
        parent_folder_permissions: Union[str, None] = "r-x",
        root_folder_permissions: Union[str, None] = "r-x",
        set_as_default_acl: bool = True,
        create_directory_if_not_exists: bool = True,
    ) -> None:
        self.storage_account = storage_account
        self.container = container
        self.credential = credential
        self.directory = directory
        self.group_object_id = group_object_id
        self.folder_permissions = folder_permissions
        self.parent_folder_permissions = parent_folder_permissions
        self.root_folder_permissions = root_folder_permissions
        self.set_as_default_acl = set_as_default_acl
        self.create_directory_if_not_exists = create_directory_if_not_exists

    @staticmethod
    def system_type():
        """
        Attributes:
            SystemType (Environment): Requires PYTHON
        """
        return SystemType.PYTHON

    @staticmethod
    def libraries():
        libraries = Libraries()
        libraries.add_pypi_library(get_default_package("azure_adls_gen_2"))
        return libraries

    @staticmethod
    def settings() -> dict:
        return {}

    def _set_acl(
        self,
        file_system_client: FileSystemClient,
        path: str,
        group_object_id: str,
        folder_permissions: str,
        set_as_default_acl: bool,
    ):
        acl_directory_client = file_system_client.get_directory_client(path)

        group_id_acl = "group:{}:{}".format(group_object_id, folder_permissions)
        acl_props = acl_directory_client.get_access_control().get("acl")
        acl_props_list = acl_props.split(",")

        for acl in acl_props_list:
            if group_object_id in acl:
                acl_props_list.remove(acl)

        acl_props_list.append(group_id_acl)
        if set_as_default_acl == True:
            acl_props_list.append("default:{}".format(group_id_acl))

        new_acl_props = ",".join(acl_props_list)
        acl_directory_client.set_access_control(acl=new_acl_props)

    def execute(self) -> bool:
        try:
            # Setup file system client
            service_client = DataLakeServiceClient(
                account_url="{}://{}.dfs.core.windows.net".format(
                    "https", self.storage_account
                ),
                credential=self.credential,
            )
            file_system_client = service_client.get_file_system_client(
                file_system=self.container
            )

            # Create directory if it doesn't already exist
            if self.create_directory_if_not_exists:
                directory_client = file_system_client.get_directory_client(
                    self.directory
                )
                if not directory_client.exists():
                    file_system_client.create_directory(self.directory)

            group_object_id = str(self.group_object_id)
            acl_path = ""
            directory_list = self.directory.split("/")

            # Set Root Folder ACLs if specified
            if self.root_folder_permissions != None:
                self._set_acl(
                    file_system_client,
                    "/",
                    group_object_id,
                    self.root_folder_permissions,
                    False,
                )

            # Set Parent Folders ACLs if specified
            if self.parent_folder_permissions != None:
                for directory in directory_list[:-1]:
                    if directory == "":
                        acl_path = "/"
                        continue
                    elif acl_path == "/":
                        acl_path += directory
                    else:
                        acl_path += "/" + directory

                    self._set_acl(
                        file_system_client,
                        acl_path,
                        group_object_id,
                        self.parent_folder_permissions,
                        False,
                    )

            # Set Folder ACLs
            self._set_acl(
                file_system_client,
                self.directory,
                group_object_id,
                self.folder_permissions,
                self.set_as_default_acl,
            )

            return True

        except Exception as e:
            logging.exception(str(e))
            raise e
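`_set_acl` appends a POSIX-style entry of the form `group:<object-id>:<permissions>` to the directory's existing ACL, and adds a matching `default:` entry when `set_as_default_acl` is True. A small illustration of the strings it builds (the group object ID is a placeholder):

```python
group_object_id = "00000000-0000-0000-0000-000000000000"  # placeholder Azure AD group
folder_permissions = "r-x"

group_id_acl = "group:{}:{}".format(group_object_id, folder_permissions)
# -> "group:00000000-0000-0000-0000-000000000000:r-x"

# With set_as_default_acl=True the entry is also appended as a default ACL,
# so new children of the directory inherit it:
default_acl = "default:{}".format(group_id_acl)
```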

system_type() staticmethod

Attributes:

| Name | Type | Description |
| --- | --- | --- |
| SystemType | Environment | Requires PYTHON |

Source code in src/sdk/python/rtdip_sdk/pipelines/utilities/azure/adls_gen2_acl.py
@staticmethod
def system_type():
    """
    Attributes:
        SystemType (Environment): Requires PYTHON
    """
    return SystemType.PYTHON

AzureAutoloaderResourcesUtility

Bases: UtilitiesInterface

Creates the required Azure Resources for the Databricks Autoloader Notification Mode.

Example

from rtdip_sdk.pipelines.utilities import AzureAutoloaderResourcesUtility

azure_autoloader_resources_utility = AzureAutoloaderResourcesUtility(
    subscription_id="YOUR-SUBSCRIPTION-ID",
    resource_group_name="YOUR-RESOURCE-GROUP",
    storage_account="YOUR-STORAGE-ACCOUNT-NAME",
    container="YOUR-CONTAINER-NAME",
    directory="DIRECTORY",
    credential="YOUR-CLIENT-ID",
    event_subscription_name="YOUR-EVENT-SUBSCRIPTION",
    queue_name="YOUR-QUEUE-NAME",
    system_topic_name=None
)

result = azure_autoloader_resources_utility.execute()

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| subscription_id | str | Azure Subscription ID | required |
| resource_group_name | str | Resource Group Name of Subscription | required |
| storage_account | str | Storage Account Name | required |
| container | str | Container Name | required |
| directory | str | Directory to be used for filtering messages in the Event Subscription. This will be equivalent to the Databricks Autoloader Path | required |
| credential | TokenCredential | Credentials to authenticate with Storage Account | required |
| event_subscription_name | str | Name of the Event Subscription | required |
| queue_name | str | Name of the queue that will be used for the Endpoint of the Messages | required |
Source code in src/sdk/python/rtdip_sdk/pipelines/utilities/azure/autoloader_resources.py
class AzureAutoloaderResourcesUtility(UtilitiesInterface):
    """
    Creates the required Azure Resources for the Databricks Autoloader Notification Mode.

    Example
    --------
    ```python
    from rtdip_sdk.pipelines.utilities import AzureAutoloaderResourcesUtility

    azure_autoloader_resources_utility = AzureAutoloaderResourcesUtility(
        subscription_id="YOUR-SUBSCRIPTION-ID",
        resource_group_name="YOUR-RESOURCE-GROUP",
        storage_account="YOUR-STORAGE-ACCOUNT-NAME",
        container="YOUR-CONTAINER-NAME",
        directory="DIRECTORY",
        credential="YOUR-CLIENT-ID",
        event_subscription_name="YOUR-EVENT-SUBSCRIPTION",
        queue_name="YOUR-QUEUE-NAME",
        system_topic_name=None
    )

    result = azure_autoloader_resources_utility.execute()
    ```

    Parameters:
        subscription_id (str): Azure Subscription ID
        resource_group_name (str): Resource Group Name of Subscription
        storage_account (str): Storage Account Name
        container (str): Container Name
        directory (str): Directory to be used for filtering messages in the Event Subscription. This will be equivalent to the Databricks Autoloader Path
        credential (TokenCredential): Credentials to authenticate with Storage Account
        event_subscription_name (str): Name of the Event Subscription
        queue_name (str): Name of the queue that will be used for the Endpoint of the Messages
    """

    subscription_id: str
    resource_group_name: str
    storage_account: str
    container: str
    directory: str
    credential: TokenCredential
    event_subscription_name: str
    queue_name: str

    def __init__(
        self,
        subscription_id: str,
        resource_group_name: str,
        storage_account: str,
        container: str,
        directory: str,
        credential: TokenCredential,
        event_subscription_name: str,
        queue_name: str,
    ) -> None:
        self.subscription_id = subscription_id
        self.resource_group_name = resource_group_name
        self.storage_account = storage_account
        self.container = container
        self.directory = directory
        self.credential = credential
        self.event_subscription_name = event_subscription_name
        self.queue_name = queue_name

    @staticmethod
    def system_type():
        """
        Attributes:
            SystemType (Environment): Requires PYTHON
        """
        return SystemType.PYTHON

    @staticmethod
    def libraries():
        libraries = Libraries()
        libraries.add_pypi_library(get_default_package("azure_eventgrid_mgmt"))
        libraries.add_pypi_library(get_default_package("azure_storage_mgmt"))
        return libraries

    @staticmethod
    def settings() -> dict:
        return {}

    def execute(self) -> bool:
        storage_mgmt_client = StorageManagementClient(
            credential=self.credential, subscription_id=self.subscription_id
        )

        try:
            queue_response = storage_mgmt_client.queue.get(
                resource_group_name=self.resource_group_name,
                account_name=self.storage_account,
                queue_name=self.queue_name,
            )
        except ResourceNotFoundError:
            queue_response = None

        if queue_response == None:
            storage_mgmt_client.queue.create(
                resource_group_name=self.resource_group_name,
                account_name=self.storage_account,
                queue_name=self.queue_name,
                queue=StorageQueue(),
            )

        eventgrid_client = EventGridManagementClient(
            credential=self.credential, subscription_id=self.subscription_id
        )

        source = "/subscriptions/{}/resourceGroups/{}/providers/Microsoft.Storage/StorageAccounts/{}".format(
            self.subscription_id, self.resource_group_name, self.storage_account
        )

        try:
            event_subscription_response = eventgrid_client.event_subscriptions.get(
                scope=source, event_subscription_name=self.event_subscription_name
            )
        except ResourceNotFoundError:
            event_subscription_response = None

        if event_subscription_response == None:
            event_subscription_destination = StorageQueueEventSubscriptionDestination(
                resource_id=source,
                queue_name=self.queue_name,
                queue_message_time_to_live_in_seconds=None,
            )

            event_subscription_filter = EventSubscriptionFilter(
                subject_begins_with="/blobServices/default/containers/{}/blobs/{}".format(
                    self.container, self.directory
                ),
                included_event_types=[
                    "Microsoft.Storage.BlobCreated",
                    "Microsoft.Storage.BlobRenamed",
                    "Microsoft.Storage.DirectoryRenamed",
                ],
                advanced_filters=[
                    StringContainsAdvancedFilter(
                        key="data.api",
                        values=[
                            "CopyBlob",
                            "PutBlob",
                            "PutBlockList",
                            "FlushWithClose",
                            "RenameFile",
                            "RenameDirectory",
                        ],
                    )
                ],
            )

            retry_policy = RetryPolicy()

            event_subscription_info = EventSubscription(
                destination=event_subscription_destination,
                filter=event_subscription_filter,
                event_delivery_schema=EventDeliverySchema.EVENT_GRID_SCHEMA,
                retry_policy=retry_policy,
            )

            eventgrid_client.event_subscriptions.begin_create_or_update(
                scope=source,
                event_subscription_name=self.event_subscription_name,
                event_subscription_info=event_subscription_info,
            ).result()

            return True
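The queue and Event Grid subscription created here are what Auto Loader's file notification mode consumes. A sketch of the source options that would point at them, assuming the standard Databricks Auto Loader notification settings (all values are placeholders):

```python
options = {
    "cloudFiles.useNotifications": "true",
    "cloudFiles.queueName": "YOUR-QUEUE-NAME",
    "cloudFiles.subscriptionId": "YOUR-SUBSCRIPTION-ID",
    "cloudFiles.resourceGroup": "YOUR-RESOURCE-GROUP",
    "cloudFiles.tenantId": "YOUR-TENANT-ID",
    "cloudFiles.clientId": "YOUR-CLIENT-ID",
    "cloudFiles.clientSecret": "YOUR-CLIENT-SECRET",
}
```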

system_type() staticmethod

Attributes:

| Name | Type | Description |
| --- | --- | --- |
| SystemType | Environment | Requires PYTHON |

Source code in src/sdk/python/rtdip_sdk/pipelines/utilities/azure/autoloader_resources.py
@staticmethod
def system_type():
    """
    Attributes:
        SystemType (Environment): Requires PYTHON
    """
    return SystemType.PYTHON

PipelineComponentsGetUtility

Bases: UtilitiesInterface

Gets the list of imported RTDIP components. Returns the libraries and settings of the components to be used in the pipeline.

Call this component after all imports of the RTDIP components to ensure that the components can be determined.
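Example

A minimal usage sketch; `execute` returns a tuple of the collected `Libraries` and the combined Spark configuration dictionary:

```python
from rtdip_sdk.pipelines.utilities.pipeline_components import PipelineComponentsGetUtility

# Import the RTDIP components used in the pipeline before calling this utility.
(libraries, spark_configuration) = PipelineComponentsGetUtility(
    spark_config={}
).execute()
```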

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| module | str, optional | Provide the module to use for imports of rtdip-sdk components. If not populated, it will use the calling module to check for imports | None |
| spark_config | dict, optional | Additional spark configuration to be applied to the spark session | None |
Source code in src/sdk/python/rtdip_sdk/pipelines/utilities/pipeline_components.py
class PipelineComponentsGetUtility(UtilitiesInterface):
    """
    Gets the list of imported RTDIP components. Returns the libraries and settings of the components to be used in the pipeline.

    Call this component after all imports of the RTDIP components to ensure that the components can be determined.

    Parameters:
        module (optional str): Provide the module to use for imports of rtdip-sdk components. If not populated, it will use the calling module to check for imports
        spark_config (optional dict): Additional spark configuration to be applied to the spark session
    """

    def __init__(self, module: str = None, spark_config: dict = None) -> None:
        if module == None:
            frm = inspect.stack()[1]
            mod = inspect.getmodule(frm[0])
            self.module = mod.__name__
        else:
            self.module = module
        self.spark_config = {} if spark_config is None else spark_config

    @staticmethod
    def system_type():
        """
        Attributes:
            SystemType (Environment): Requires PYTHON
        """
        return SystemType.PYTHON

    @staticmethod
    def libraries():
        libraries = Libraries()
        return libraries

    @staticmethod
    def settings() -> dict:
        return {}

    def execute(self) -> Tuple[Libraries, dict]:
        from ..sources.interfaces import SourceInterface
        from ..destinations.interfaces import DestinationInterface
        from ..deploy.interfaces import DeployInterface
        from ..secrets.interfaces import SecretsInterface
        from ..transformers.interfaces import TransformerInterface

        try:
            classes_imported = inspect.getmembers(
                sys.modules[self.module], inspect.isclass
            )
            component_list = []
            for cls in classes_imported:
                class_check = getattr(sys.modules[self.module], cls[0])
                if (
                    (
                        issubclass(class_check, SourceInterface)
                        and class_check != SourceInterface
                    )
                    or (
                        issubclass(class_check, DestinationInterface)
                        and class_check != DestinationInterface
                    )
                    or (
                        issubclass(class_check, DeployInterface)
                        and class_check != DeployInterface
                    )
                    or (
                        issubclass(class_check, SecretsInterface)
                        and class_check != SecretsInterface
                    )
                    or (
                        issubclass(class_check, TransformerInterface)
                        and class_check != TransformerInterface
                    )
                    or (
                        issubclass(class_check, UtilitiesInterface)
                        and class_check != UtilitiesInterface
                    )
                ):
                    component_list.append(cls[1])

            task_libraries = Libraries()
            task_libraries.get_libraries_from_components(component_list)
            spark_configuration = self.spark_config
            for component in component_list:
                spark_configuration = {**spark_configuration, **component.settings()}
            return (task_libraries, spark_configuration)

        except Exception as e:
            logging.exception(str(e))
            raise e

system_type() staticmethod

Attributes:

| Name | Type | Description |
| --- | --- | --- |
| SystemType | Environment | Requires PYTHON |

Source code in src/sdk/python/rtdip_sdk/pipelines/utilities/pipeline_components.py
@staticmethod
def system_type():
    """
    Attributes:
        SystemType (Environment): Requires PYTHON
    """
    return SystemType.PYTHON

SparkSessionUtility

Bases: UtilitiesInterface

Creates or Gets a Spark Session and uses settings and libraries of the imported RTDIP components to populate the spark configuration and jars in the spark session.

Call this component after all imports of the RTDIP components to ensure that the spark session is configured correctly.

Example

from rtdip_sdk.pipelines.utilities import SparkSessionUtility

spark_session_utility = SparkSessionUtility(
    config={},
    module=None,
    remote=None
)

result = spark_session_utility.execute()

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| config | dict, optional | Dictionary of spark configuration to be applied to the spark session | None |
| module | str, optional | Provide the module to use for imports of rtdip-sdk components. If not populated, it will use the calling module to check for imports | None |
| remote | str, optional | Specify the remote parameters if intending to use Spark Connect | None |
Source code in src/sdk/python/rtdip_sdk/pipelines/utilities/spark/session.py
class SparkSessionUtility(UtilitiesInterface):
    """
    Creates or Gets a Spark Session and uses settings and libraries of the imported RTDIP components to populate the spark configuration and jars in the spark session.

    Call this component after all imports of the RTDIP components to ensure that the spark session is configured correctly.

    Example
    --------
    ```python
    from rtdip_sdk.pipelines.utilities import SparkSessionUtility

    spark_session_utility = SparkSessionUtility(
        config={},
        module=None,
        remote=None
    )

    result = spark_session_utility.execute()
    ```

    Parameters:
        config (optional dict): Dictionary of spark configuration to be applied to the spark session
        module (optional str): Provide the module to use for imports of rtdip-sdk components. If not populated, it will use the calling module to check for imports
        remote (optional str): Specify the remote parameters if intending to use Spark Connect
    """

    spark: SparkSession
    config: dict
    module: str

    def __init__(
        self, config: dict = None, module: str = None, remote: str = None
    ) -> None:
        self.config = config
        if module == None:
            frm = inspect.stack()[1]
            mod = inspect.getmodule(frm[0])
            self.module = mod.__name__
        else:
            self.module = module
        self.remote = remote

    @staticmethod
    def system_type():
        """
        Attributes:
            SystemType (Environment): Requires PYSPARK
        """
        return SystemType.PYSPARK

    @staticmethod
    def libraries():
        libraries = Libraries()
        return libraries

    @staticmethod
    def settings() -> dict:
        return {}

    def execute(self) -> SparkSession:
        """To execute"""
        try:
            (task_libraries, spark_configuration) = PipelineComponentsGetUtility(
                self.module, self.config
            ).execute()
            self.spark = SparkClient(
                spark_configuration=spark_configuration,
                spark_libraries=task_libraries,
                spark_remote=self.remote,
            ).spark_session
            return self.spark

        except Exception as e:
            logging.exception(str(e))
            raise e

system_type() staticmethod

Attributes:

| Name | Type | Description |
| --- | --- | --- |
| SystemType | Environment | Requires PYSPARK |

Source code in src/sdk/python/rtdip_sdk/pipelines/utilities/spark/session.py
@staticmethod
def system_type():
    """
    Attributes:
        SystemType (Environment): Requires PYSPARK
    """
    return SystemType.PYSPARK

execute()

To execute

Source code in src/sdk/python/rtdip_sdk/pipelines/utilities/spark/session.py
def execute(self) -> SparkSession:
    """To execute"""
    try:
        (task_libraries, spark_configuration) = PipelineComponentsGetUtility(
            self.module, self.config
        ).execute()
        self.spark = SparkClient(
            spark_configuration=spark_configuration,
            spark_libraries=task_libraries,
            spark_remote=self.remote,
        ).spark_session
        return self.spark

    except Exception as e:
        logging.exception(str(e))
        raise e
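When `remote` is supplied it is passed through to the `SparkClient` as `spark_remote`, which allows a Spark Connect connection string to be used. A sketch, where the host and port are placeholders and the `sc://` scheme is the standard Spark Connect format rather than anything specific to this utility:

```python
from rtdip_sdk.pipelines.utilities import SparkSessionUtility

spark = SparkSessionUtility(
    config={},
    remote="sc://{SPARK-CONNECT-HOST}:15002"
).execute()
```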

PipelineJobFromJsonConverter

Bases: ConverterInterface

Converts a json string into a Pipeline Job.

Example

from rtdip_sdk.pipelines.converters.pipeline_job_json import PipelineJobFromJsonConverter

convert_json_string_to_pipeline_job = PipelineJobFromJsonConverter(
    pipeline_json = "{JSON-STRING}"
)

convert_json_string_to_pipeline_job.convert()

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| pipeline_json | str | Json representing PipelineJob information, including tasks and related steps | required |
Source code in src/sdk/python/rtdip_sdk/pipelines/converters/pipeline_job_json.py
class PipelineJobFromJsonConverter(ConverterInterface):
    """
    Converts a json string into a Pipeline Job.

    Example
    -------
    ```python
    from rtdip_sdk.pipelines.converters.pipeline_job_json import PipelineJobFromJsonConverter

    convert_json_string_to_pipeline_job = PipelineJobFromJsonConverter(
        pipeline_json = "{JSON-STRING}"
    )

    convert_json_string_to_pipeline_job.convert()
    ```

    Parameters:
        pipeline_json (str): Json representing PipelineJob information, including tasks and related steps
    """

    pipeline_json: str

    def __init__(self, pipeline_json: str):
        self.pipeline_json = pipeline_json

    def _try_convert_to_pipeline_secret(self, value):
        try:
            if "pipeline_secret" in value:
                value["pipeline_secret"]["type"] = getattr(
                    sys.modules[__name__], value["pipeline_secret"]["type"]
                )
            return PipelineSecret.parse_obj(value["pipeline_secret"])
        except:  # NOSONAR
            return value

    def convert(self) -> PipelineJob:
        """
        Converts a json string to a Pipeline Job
        """
        pipeline_job_dict = json.loads(self.pipeline_json)

        # convert string component to class
        for task in pipeline_job_dict["task_list"]:
            for step in task["step_list"]:
                step["component"] = getattr(sys.modules[__name__], step["component"])
                for param_key, param_value in step["component_parameters"].items():
                    step["component_parameters"][param_key] = (
                        self._try_convert_to_pipeline_secret(param_value)
                    )
                    if not isinstance(
                        step["component_parameters"][param_key], PipelineSecret
                    ) and isinstance(param_value, dict):
                        for key, value in param_value.items():
                            step["component_parameters"][param_key][key] = (
                                self._try_convert_to_pipeline_secret(value)
                            )

        return PipelineJob(**pipeline_job_dict)

convert()

Converts a json string to a Pipeline Job

Source code in src/sdk/python/rtdip_sdk/pipelines/converters/pipeline_job_json.py
def convert(self) -> PipelineJob:
    """
    Converts a json string to a Pipeline Job
    """
    pipeline_job_dict = json.loads(self.pipeline_json)

    # convert string component to class
    for task in pipeline_job_dict["task_list"]:
        for step in task["step_list"]:
            step["component"] = getattr(sys.modules[__name__], step["component"])
            for param_key, param_value in step["component_parameters"].items():
                step["component_parameters"][param_key] = (
                    self._try_convert_to_pipeline_secret(param_value)
                )
                if not isinstance(
                    step["component_parameters"][param_key], PipelineSecret
                ) and isinstance(param_value, dict):
                    for key, value in param_value.items():
                        step["component_parameters"][param_key][key] = (
                            self._try_convert_to_pipeline_secret(value)
                        )

    return PipelineJob(**pipeline_job_dict)
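`convert` only walks the `task_list` / `step_list` / `component` / `component_parameters` structure shown above, resolving each `component` string to an imported RTDIP class and unpacking any `pipeline_secret` entries. The sketch below illustrates that shape only; any further fields the `PipelineJob` model requires (job, task and step names, versions, dependency lists and so on) are not shown and depend on the model definition:

```python
import json

# Hypothetical shape, showing only the keys convert() traverses.
pipeline_json = json.dumps({
    "task_list": [
        {
            "step_list": [
                {
                    "component": "{RTDIP-COMPONENT-CLASS-NAME}",  # resolved via getattr
                    "component_parameters": {
                        "{PARAMETER-NAME}": "{PARAMETER-VALUE}",
                    },
                }
            ],
        }
    ],
})
```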

PipelineJobToJsonConverter

Bases: ConverterInterface

Converts a Pipeline Job into a json string.

Example

from rtdip_sdk.pipelines.converters.pipeline_job_json import PipelineJobToJsonConverter

convert_pipeline_job_to_json_string = PipelineJobToJsonConverter(
    pipeline_job = PipelineJob
)

convert_pipeline_job_to_json_string.convert()

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| pipeline_job | PipelineJob | A Pipeline Job consisting of tasks and steps | required |
Source code in src/sdk/python/rtdip_sdk/pipelines/converters/pipeline_job_json.py
class PipelineJobToJsonConverter(ConverterInterface):
    """
    Converts a Pipeline Job into a json string.

    Example
    -------
    ```python
    from rtdip_sdk.pipelines.converters.pipeline_job_json import PipelineJobToJsonConverter

    convert_pipeline_job_to_json_string = PipelineJobToJsonConverter(
        pipeline_job = PipelineJob
    )

    convert_pipeline_job_to_json_string.convert()
    ```

    Parameters:
        pipeline_job (PipelineJob): A Pipeline Job consisting of tasks and steps
    """

    pipeline_job: PipelineJob

    def __init__(self, pipeline_job: PipelineJob):
        self.pipeline_job = pipeline_job

    def convert(self):
        """
        Converts a Pipeline Job to a json string
        """
        # required because pydantic does not use encoders in subclasses
        for task in self.pipeline_job.task_list:
            step_dict_list = []
            for step in task.step_list:
                step_dict_list.append(
                    json.loads(step.json(models_as_dict=False, exclude_none=True))
                )
            task.step_list = step_dict_list

        pipeline_job_json = self.pipeline_job.json(exclude_none=True)
        return pipeline_job_json

convert()

Converts a Pipeline Job to a json string

Source code in src/sdk/python/rtdip_sdk/pipelines/converters/pipeline_job_json.py
def convert(self):
    """
    Converts a Pipeline Job to a json string
    """
    # required because pydantic does not use encoders in subclasses
    for task in self.pipeline_job.task_list:
        step_dict_list = []
        for step in task.step_list:
            step_dict_list.append(
                json.loads(step.json(models_as_dict=False, exclude_none=True))
            )
        task.step_list = step_dict_list

    pipeline_job_json = self.pipeline_job.json(exclude_none=True)
    return pipeline_job_json