class BaseRawToMDMTransformer(TransformerInterface):
"""
Base class for all the Raw to Meters Data Model Transformers.
Meters Data Model requires two outputs:
- `UsageData` : To store measurement(value) as timeseries data.
- `MetaData` : To store meters related meta information.
It supports the generation of both the outputs as they share some common properties.
Parameters:
spark (SparkSession): Spark Session instance.
data (DataFrame): Dataframe containing the raw MISO data.
output_type (str): Must be one of `usage` or `meta`.
name (str): Set this to override default `name` column.
description (str): Set this to override default `description` column.
value_type (ValueType): Set this to override default `value_type` column.
version (str): Set this to override default `version` column.
series_id (str): Set this to override default `series_id` column.
series_parent_id (str): Set this to override default `series_parent_id` column.
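
    Example:
        A minimal usage sketch. `ExampleRawToMDMTransformer` stands in for
        any concrete subclass; the name and variables are illustrative, not
        part of the SDK:

            transformer = ExampleRawToMDMTransformer(
                spark=spark,
                data=raw_df,
                output_type="usage",
            )
            usage_df = transformer.transform()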
"""

    spark: SparkSession
    data: DataFrame
    output_type: str
    input_schema: StructType
    target_schema: StructType
    uid_col: str
    series_id_col: str
    timestamp_col: str
    interval_timestamp_col: str
    value_col: str
    series_parent_id_col: str
    name_col: str
    uom_col: str
    description_col: str
    timestamp_start_col: str
    timestamp_end_col: str
    time_zone_col: str
    version_col: str
    series_type: SeriesType
    model_type: ModelType
    value_type: ValueType
    properties_col: str

    def __init__(
        self,
        spark: SparkSession,
        data: DataFrame,
        output_type: str,
        name: str = None,
        description: str = None,
        value_type: ValueType = None,
        version: str = None,
        series_id: str = None,
        series_parent_id: str = None,
    ):
        self.spark = spark
        self.data = data
        self.output_type = output_type
        # Each optional override falls back to the subclass-defined default
        # when the caller does not supply a value.
        self.name = name if name is not None else self.name_col
        self.description = (
            description if description is not None else self.description_col
        )
        self.value_type = value_type if value_type is not None else self.value_type
        self.version = version if version is not None else self.version_col
        self.series_id = series_id if series_id is not None else self.series_id_col
        self.series_parent_id = (
            series_parent_id
            if series_parent_id is not None
            else self.series_parent_id_col
        )

    @staticmethod
    def system_type():
        return SystemType.PYSPARK

    @staticmethod
    def libraries():
        libraries = Libraries()
        return libraries

    @staticmethod
    def settings() -> dict:
        return {}

    def pre_transform_validation(self) -> bool:
        valid_output_types = ["usage", "meta"]
        if self.output_type not in valid_output_types:
            raise ValueError(
                f"Invalid output_type `{self.output_type}` given. Must be one of {valid_output_types}"
            )
        # The raw data must match the subclass-declared input schema, and the
        # configured enums must be of the expected types.
        assert str(self.data.schema) == str(self.input_schema)
        assert type(self.series_type).__name__ == SeriesType.__name__
        assert type(self.model_type).__name__ == ModelType.__name__
        assert type(self.value_type).__name__ == ValueType.__name__
        return True

    def post_transform_validation(self) -> bool:
        assert str(self.data.schema) == str(self.target_schema)
        return True

    def _get_transformed_df(self) -> DataFrame:
        if self.output_type == "usage":
            self.target_schema = MDM_USAGE_SCHEMA
            return self._get_usage_transformed_df()
        else:
            self.target_schema = MDM_META_SCHEMA
            return self._get_meta_transformed_df()

    def _convert_into_target_schema(self) -> None:
        """
        Casts and reorders the columns of `self.data` to match the target schema.

        Returns: Nothing.
        """
        df: DataFrame = self.data
        df = df.select(self.target_schema.names)
        for field in self.target_schema.fields:
            df = df.withColumn(field.name, col(field.name).cast(field.dataType))
        # Rebuild the DataFrame from the casted rows so the target schema's
        # nullability and metadata are applied exactly; casting alone keeps
        # the source columns' nullability.
        self.data = self.spark.createDataFrame(df.rdd, self.target_schema)

    def transform(self) -> DataFrame:
        """
        Validates the raw data, transforms it and casts it into the target schema.

        Returns:
            DataFrame: A dataframe with the raw data converted into MDM.
        """
        self.pre_transform_validation()
        self.data = self._get_transformed_df()
        self._convert_into_target_schema()
        self.post_transform_validation()
        return self.data
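
    # Each `_add_*` helper below appends one MDM column by evaluating either a
    # Spark SQL expression supplied by the concrete subclass (via `expr`) or a
    # literal enum value (via `lit`). For example, a subclass might set
    # `uid_col = "concat(Unit, '_', Datetime)"` (an illustrative expression,
    # not part of the SDK), which `_add_uid_column` then evaluates against the
    # raw columns.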

    def _add_uid_column(self, df: DataFrame) -> DataFrame:
        return df.withColumn("Uid", expr(self.uid_col))

    def _add_series_id_column(self, df: DataFrame) -> DataFrame:
        return df.withColumn("SeriesId", expr(self.series_id))

    def _add_timestamp_column(self, df: DataFrame) -> DataFrame:
        return df.withColumn("Timestamp", expr(self.timestamp_col))

    def _add_interval_timestamp_column(self, df: DataFrame) -> DataFrame:
        return df.withColumn("IntervalTimestamp", expr(self.interval_timestamp_col))

    def _add_value_column(self, df: DataFrame) -> DataFrame:
        return df.withColumn("Value", expr(self.value_col))

    def _add_series_parent_id_column(self, df: DataFrame) -> DataFrame:
        return df.withColumn("SeriesParentId", expr(self.series_parent_id))

    def _add_name_column(self, df: DataFrame) -> DataFrame:
        return df.withColumn("Name", expr(self.name))

    def _add_uom_column(self, df: DataFrame) -> DataFrame:
        return df.withColumn("Uom", expr(self.uom_col))

    def _add_description_column(self, df: DataFrame) -> DataFrame:
        return df.withColumn("Description", expr(self.description))

    def _add_timestamp_start_column(self, df: DataFrame) -> DataFrame:
        return df.withColumn("TimestampStart", expr(self.timestamp_start_col))

    def _add_timestamp_end_column(self, df: DataFrame) -> DataFrame:
        return df.withColumn("TimestampEnd", expr(self.timestamp_end_col))

    def _add_time_zone_column(self, df: DataFrame) -> DataFrame:
        return df.withColumn("Timezone", expr(self.time_zone_col))

    def _add_version_column(self, df: DataFrame) -> DataFrame:
        return df.withColumn("Version", expr(self.version))

    def _add_series_type_column(self, df: DataFrame) -> DataFrame:
        return df.withColumn("SeriesType", lit(self.series_type.value))

    def _add_model_type_column(self, df: DataFrame) -> DataFrame:
        return df.withColumn("ModelType", lit(self.model_type.value))

    def _add_value_type_column(self, df: DataFrame) -> DataFrame:
        return df.withColumn("ValueType", lit(self.value_type.value))

    def _add_properties_column(self, df: DataFrame) -> DataFrame:
        return df.withColumn("Properties", expr(self.properties_col))

    def _pre_process(self) -> DataFrame:
        # Hook for subclasses to prepare the raw data before the MDM columns
        # are added; the base implementation is a pass-through.
        return self.data

    @staticmethod
    def _post_process(df: DataFrame) -> DataFrame:
        # Hook for subclasses to adjust the transformed data; the base
        # implementation is a pass-through.
        return df

    def _get_usage_transformed_df(self) -> DataFrame:
        df = self._pre_process()
        df = self._add_uid_column(df)
        df = self._add_series_id_column(df)
        df = self._add_timestamp_column(df)
        df = self._add_interval_timestamp_column(df)
        df = self._add_value_column(df)
        df = self._post_process(df)
        return df

    def _get_meta_transformed_df(self) -> DataFrame:
        df = self._pre_process()
        df = self._add_uid_column(df)
        df = self._add_series_id_column(df)
        df = self._add_series_parent_id_column(df)
        df = self._add_name_column(df)
        df = self._add_uom_column(df)
        df = self._add_description_column(df)
        df = self._add_timestamp_start_column(df)
        df = self._add_timestamp_end_column(df)
        df = self._add_time_zone_column(df)
        df = self._add_version_column(df)
        df = self._add_series_type_column(df)
        df = self._add_model_type_column(df)
        df = self._add_value_type_column(df)
        df = self._add_properties_column(df)
        df = self._post_process(df)
        return df
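

# ---------------------------------------------------------------------------
# Illustrative sketch only (not part of the SDK): a minimal concrete subclass
# showing which attributes a Raw to MDM transformer supplies. The class name,
# input schema and column expressions below are hypothetical.
#
# class ExampleRawToMDMTransformer(BaseRawToMDMTransformer):
#     input_schema = StructType(
#         [
#             StructField("Datetime", TimestampType(), True),
#             StructField("Unit", StringType(), True),
#             StructField("Load", DoubleType(), True),
#         ]
#     )
#     uid_col = "concat(Unit, '_', Datetime)"
#     series_id_col = "Unit"
#     timestamp_col = "Datetime"
#     interval_timestamp_col = "Datetime + INTERVAL 1 HOURS"
#     value_col = "Load"
#     series_parent_id_col = "Unit"
#     name_col = "'Example Hourly Load'"
#     uom_col = "'MWH'"
#     description_col = "'Example hourly load data'"
#     timestamp_start_col = "Datetime"
#     timestamp_end_col = "Datetime + INTERVAL 1 HOURS"
#     time_zone_col = "'UTC'"
#     version_col = "'1'"
#     series_type = SeriesType.Hour
#     model_type = ModelType.Default
#     value_type = ValueType.Usage
#     properties_col = "null"
# ---------------------------------------------------------------------------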