Read from an IoT Hub

`SparkIoThubSource`

Bases: SourceInterface

This Spark source class is used to read batch or streaming data from an IoT Hub. IoT Hub configurations need to be specified as options in a dictionary. Additionally, further optional configurations can be found here. If using startingPosition or endingPosition, make sure to check out the Event Position section for more details and examples.

Example

# IoT Hub Source for Streaming Queries

import json

from rtdip_sdk.pipelines.sources import SparkIoThubSource
from rtdip_sdk.pipelines.utilities import SparkSessionUtility

# Not required if using Databricks
spark = SparkSessionUtility(config={}).execute()

connection_string = "Endpoint=sb://{NAMESPACE}.servicebus.windows.net/;SharedAccessKeyName={ACCESS_KEY_NAME};SharedAccessKey={ACCESS_KEY}=;EntityPath={EVENT_HUB_NAME}"

# Event position the streaming query starts reading from.
starting_event_position = {
    "offset": -1,
    "seqNo": -1,
    "enqueuedTime": None,
    "isInclusive": True,
}

# The position is handed to the connector as a JSON string.
iot_hub_options = {
    "eventhubs.connectionString": connection_string,
    "eventhubs.consumerGroup": "{YOUR-CONSUMER-GROUP}",
    "eventhubs.startingPosition": json.dumps(starting_event_position),
    "maxEventsPerTrigger": 1000,
}

iot_hub_source = SparkIoThubSource(spark=spark, options=iot_hub_options)

iot_hub_source.read_stream()

 #IoT Hub Source for Batch Queries

from rtdip_sdk.pipelines.sources import SparkIoThubSource
from rtdip_sdk.pipelines.utilities import SparkSessionUtility
from datetime import datetime as dt
import json

# Not required if using Databricks
spark = SparkSessionUtility(config={}).execute()

connectionString = "Endpoint=sb://{NAMESPACE}.servicebus.windows.net/;SharedAccessKeyName={ACCESS_KEY_NAME};SharedAccessKey={ACCESS_KEY}=;EntityPath={EVENT_HUB_NAME}"

startingEventPosition = {
    "offset": -1,
    "seqNo": -1,
    "enqueuedTime": None,
    "isInclusive": True
}

# End the batch at the current time. The enqueued time must be passed as a
# formatted string because the position is serialised with json.dumps below.
endTime = dt.now().strftime("%Y-%m-%dT%H:%M:%S.%fZ")

endingEventPosition = {
    "offset": None,
    "seqNo": -1,
    "enqueuedTime": endTime,
    "isInclusive": True
}

iot_hub_source = SparkIoThubSource(
    spark=spark,
    options = {
        "eventhubs.connectionString": connectionString,
        "eventhubs.consumerGroup": "{YOUR-CONSUMER-GROUP}",
        "eventhubs.startingPosition": json.dumps(startingEventPosition),
        "eventhubs.endingPosition": json.dumps(endingEventPosition)
    }
)

iot_hub_source.read_batch()

Parameters:

Name	Type	Description	Default
`spark`	`SparkSession`	Spark Session	required
`options`	`dict`	A dictionary of IoT Hub configurations (See Attributes table below)	required

Attributes:

Name	Type	Description
`eventhubs.connectionString`	`str`	IoT Hub connection string is required to connect to the Eventhubs service. (Streaming and Batch)
`eventhubs.consumerGroup`	`str`	A consumer group is a view of an entire IoT Hub. Consumer groups enable multiple consuming applications to each have a separate view of the event stream, and to read the stream independently at their own pace and with their own offsets. (Streaming and Batch)
`eventhubs.startingPosition`	`JSON str`	The starting position for your Structured Streaming job. If a specific EventPosition is not set for a partition using startingPositions, then we use the EventPosition set in startingPosition. If nothing is set in either option, we will begin consuming from the end of the partition. (Streaming and Batch)
`eventhubs.endingPosition`	`JSON str`	The ending position of a batch query. This works the same as startingPosition. (Batch)
`maxEventsPerTrigger`	`long`	Rate limit on maximum number of events processed per trigger interval. The specified total number of events will be proportionally split across partitions of different volume. (Streaming)

Source code in src/sdk/python/rtdip_sdk/pipelines/sources/spark/iot_hub.py

class SparkIoThubSource(SourceInterface):
    """
    This Spark source class is used to read batch or streaming data from an IoT Hub. IoT Hub configurations need to be specified as options in a dictionary.
    Additionally, there are more optional configurations which can be found [here.](https://github.com/Azure/azure-event-hubs-spark/blob/master/docs/PySpark/structured-streaming-pyspark.md#event-hubs-configuration){ target="_blank" }
    If using startingPosition or endingPosition make sure to check out the **Event Position** section for more details and examples.

    Example
    --------
    ```python
    #IoT Hub Source for Streaming Queries

    from rtdip_sdk.pipelines.sources import SparkIoThubSource
    from rtdip_sdk.pipelines.utilities import SparkSessionUtility
    import json

    # Not required if using Databricks
    spark = SparkSessionUtility(config={}).execute()

    connectionString = "Endpoint=sb://{NAMESPACE}.servicebus.windows.net/;SharedAccessKeyName={ACCESS_KEY_NAME};SharedAccessKey={ACCESS_KEY}=;EntityPath={EVENT_HUB_NAME}"

    startingEventPosition = {
        "offset": -1,
        "seqNo": -1,
        "enqueuedTime": None,
        "isInclusive": True
    }

    iot_hub_source = SparkIoThubSource(
        spark=spark,
        options = {
            "eventhubs.connectionString": connectionString,
            "eventhubs.consumerGroup": "{YOUR-CONSUMER-GROUP}",
            "eventhubs.startingPosition": json.dumps(startingEventPosition),
            "maxEventsPerTrigger" : 1000
        }
    )

    iot_hub_source.read_stream()
    ```
    ```python
    #IoT Hub Source for Batch Queries

    from rtdip_sdk.pipelines.sources import SparkIoThubSource
    from rtdip_sdk.pipelines.utilities import SparkSessionUtility
    from datetime import datetime as dt
    import json

    # Not required if using Databricks
    spark = SparkSessionUtility(config={}).execute()

    connectionString = "Endpoint=sb://{NAMESPACE}.servicebus.windows.net/;SharedAccessKeyName={ACCESS_KEY_NAME};SharedAccessKey={ACCESS_KEY}=;EntityPath={EVENT_HUB_NAME}"

    startingEventPosition = {
        "offset": -1,
        "seqNo": -1,
        "enqueuedTime": None,
        "isInclusive": True
    }

    # End the batch at the current time, formatted as a string so it can be JSON encoded
    endTime = dt.now().strftime("%Y-%m-%dT%H:%M:%S.%fZ")

    endingEventPosition = {
        "offset": None,
        "seqNo": -1,
        "enqueuedTime": endTime,
        "isInclusive": True
    }

    iot_hub_source = SparkIoThubSource(
        spark=spark,
        options = {
            "eventhubs.connectionString": connectionString,
            "eventhubs.consumerGroup": "{YOUR-CONSUMER-GROUP}",
            "eventhubs.startingPosition": json.dumps(startingEventPosition),
            "eventhubs.endingPosition": json.dumps(endingEventPosition)
        }
    )

    iot_hub_source.read_batch()
    ```

    Parameters:
        spark (SparkSession): Spark Session
        options (dict): A dictionary of IoT Hub configurations (See Attributes table below)

    Attributes:
        eventhubs.connectionString (str):  IoT Hub connection string is required to connect to the Eventhubs service. (Streaming and Batch)
        eventhubs.consumerGroup (str): A consumer group is a view of an entire IoT Hub. Consumer groups enable multiple consuming applications to each have a separate view of the event stream, and to read the stream independently at their own pace and with their own offsets. (Streaming and Batch)
        eventhubs.startingPosition (JSON str): The starting position for your Structured Streaming job. If a specific EventPosition is not set for a partition using startingPositions, then we use the EventPosition set in startingPosition. If nothing is set in either option, we will begin consuming from the end of the partition. (Streaming and Batch)
        eventhubs.endingPosition (JSON str): The ending position of a batch query. This works the same as startingPosition. (Batch)
        maxEventsPerTrigger (long): Rate limit on maximum number of events processed per trigger interval. The specified total number of events will be proportionally split across partitions of different volume. (Streaming)

    """

    options: dict
    spark: SparkSession

    def __init__(self, spark: SparkSession, options: dict) -> None:
        self.spark = spark
        self.schema = EVENTHUB_SCHEMA
        self.options = options

    @staticmethod
    def system_type():
        """
        Attributes:
            SystemType (Environment): Requires PYSPARK
        """
        return SystemType.PYSPARK

    @staticmethod
    def settings() -> dict:
        """
        Returns an empty dictionary; this source defines no additional settings.
        """
        return {}

    @staticmethod
    def libraries():
        """
        Returns the libraries required by this source: the default `spark_azure_eventhub` Maven package.
        """
        spark_libraries = Libraries()
        spark_libraries.add_maven_library(get_default_package("spark_azure_eventhub"))
        return spark_libraries

    def pre_read_validation(self) -> bool:
        """
        No validation is performed before reading; always returns True.
        """
        return True

    def post_read_validation(self, df: DataFrame) -> bool:
        """
        Checks that the schema of the DataFrame that was read matches the schema held by this source.

        Parameters:
            df (DataFrame): The DataFrame returned by a read

        Returns:
            bool: True when the schemas match

        Raises:
            AssertionError: If the schemas differ
        """
        # Raised explicitly rather than via `assert` so the check still runs
        # when Python is started with -O (which strips assert statements).
        if df.schema != self.schema:
            raise AssertionError(
                "DataFrame schema does not match the expected Event Hubs schema"
            )
        return True

    def _encrypted_options(self) -> dict:
        """
        Builds the options passed to the connector, with the connection string encrypted.

        A copy of `self.options` is returned and `self.options` itself is left untouched, so that
        calling a read method more than once never encrypts an already encrypted connection string.
        """
        iothub_connection_string = "eventhubs.connectionString"
        options = dict(self.options)
        if iothub_connection_string in options:
            sc = self.spark.sparkContext
            options[iothub_connection_string] = (
                sc._jvm.org.apache.spark.eventhubs.EventHubsUtils.encrypt(
                    options[iothub_connection_string]
                )
            )
        return options

    def read_batch(self) -> DataFrame:
        """
        Reads batch data from IoT Hubs.

        Any exception raised while reading is logged and then re-raised.
        """
        try:
            options = self._encrypted_options()
            return self.spark.read.format("eventhubs").options(**options).load()

        except Py4JJavaError as e:
            logging.exception(e.errmsg)
            raise
        except Exception as e:
            logging.exception(str(e))
            raise

    def read_stream(self) -> DataFrame:
        """
        Reads streaming data from IoT Hubs.

        Any exception raised while reading is logged and then re-raised.
        """
        try:
            options = self._encrypted_options()
            return self.spark.readStream.format("eventhubs").options(**options).load()

        except Py4JJavaError as e:
            logging.exception(e.errmsg)
            raise
        except Exception as e:
            logging.exception(str(e))
            raise

`system_type()` `staticmethod`

Attributes:

Name	Type	Description
`SystemType`	`Environment`	Requires PYSPARK

Source code in src/sdk/python/rtdip_sdk/pipelines/sources/spark/iot_hub.py

@staticmethod
def system_type():
    """
    Reports the environment this source needs in order to run.

    Attributes:
        SystemType (Environment): Requires PYSPARK
    """
    required_environment = SystemType.PYSPARK
    return required_environment

`read_batch()`

Reads batch data from IoT Hubs.

Source code in src/sdk/python/rtdip_sdk/pipelines/sources/spark/iot_hub.py

def read_batch(self) -> DataFrame:
    """
    Reads batch data from IoT Hubs.

    The connection string, when present in the options, is encrypted before it is
    passed to the connector. Any exception raised while reading is logged and then
    re-raised.
    """
    iothub_connection_string = "eventhubs.connectionString"
    try:
        # Encrypt on a copy: writing the encrypted value back into self.options
        # would cause a second read to encrypt already encrypted text.
        options = dict(self.options)
        if iothub_connection_string in options:
            sc = self.spark.sparkContext
            options[iothub_connection_string] = (
                sc._jvm.org.apache.spark.eventhubs.EventHubsUtils.encrypt(
                    options[iothub_connection_string]
                )
            )

        return self.spark.read.format("eventhubs").options(**options).load()

    except Py4JJavaError as e:
        logging.exception(e.errmsg)
        raise
    except Exception as e:
        logging.exception(str(e))
        raise

`read_stream()`

Reads streaming data from IoT Hubs.

Source code in src/sdk/python/rtdip_sdk/pipelines/sources/spark/iot_hub.py

def read_stream(self) -> DataFrame:
    """
    Reads streaming data from IoT Hubs.

    The connection string, when present in the options, is encrypted before it is
    passed to the connector. Any exception raised while reading is logged and then
    re-raised.
    """
    iothub_connection_string = "eventhubs.connectionString"
    try:
        # Encrypt on a copy: writing the encrypted value back into self.options
        # would cause a second read to encrypt already encrypted text.
        options = dict(self.options)
        if iothub_connection_string in options:
            sc = self.spark.sparkContext
            options[iothub_connection_string] = (
                sc._jvm.org.apache.spark.eventhubs.EventHubsUtils.encrypt(
                    options[iothub_connection_string]
                )
            )

        return self.spark.readStream.format("eventhubs").options(**options).load()

    except Py4JJavaError as e:
        logging.exception(e.errmsg)
        raise
    except Exception as e:
        logging.exception(str(e))
        raise

Read from an IoT Hub

SparkIoThubSource

Example

system_type() staticmethod

read_batch()

read_stream()

`SparkIoThubSource`

`system_type()` `staticmethod`

`read_batch()`

`read_stream()`