|
14 | 14 | "source": [
|
15 | 15 | "In this Tutorial we will:\n",
|
16 | 16 | " - Connect to Exasol SaaS from AzureML\n",
|
| 17 | + " - Preprocess data\n", |
17 | 18 | " - Export Exasol tables to an Azure Blobstore Container\n",
|
18 | 19 | " - Create a Datastore"
|
19 | 20 | ],
|
|
125 | 126 | }
|
126 | 127 | }
|
127 | 128 | },
|
| 129 | + { |
| 130 | + "cell_type": "markdown", |
| 131 | + "source": [ |
| 132 | + "## Data Preprocessing\n", |
| 133 | + "explanation\n", |
| 134 | + " - why here not in azure\n", |
| 135 | + " - what gets done\n", |
| 136 | + "\n", |
| 137 | + "\"There are two things we need to do:\n", |
| 138 | + "\n", |
| 139 | + " Split into train and validation data\n", |
| 140 | + " Replace CLASS column by a column with boolean values\n", |
| 141 | + "\n", |
| 142 | + "For the split we add a column SPLIT that has a random value between 0 and 1, so we can partition the data by a condition on that column.\n", |
| 143 | + "\n", |
| 144 | + "In addition, we replace the CLASS with the text values pos and neg by a new column CLASS_POS with boolean values.\"\n", |
| 145 | + " - mention test table to" |
| 146 | + ], |
| 147 | + "metadata": { |
| 148 | + "collapsed": false |
| 149 | + } |
| 150 | + }, |
| 151 | + { |
| 152 | + "cell_type": "code", |
| 153 | + "execution_count": null, |
| 154 | + "outputs": [], |
| 155 | + "source": [ |
| 156 | + "all_columns = exasol.export_to_pandas(\"SELECT * FROM IDA.TRAIN LIMIT 1;\")\n", |
| 157 | + "column_names = list(all_columns)\n", |
| 158 | + "column_names.remove(\"CLASS\")\n", |
| 159 | + "exasol.execute(\"\"\"CREATE OR REPLACE TABLE IDA.TRAIN_PREPARED AS (\n", |
| 160 | + " SELECT RANDOM() AS SPLIT,\n", |
| 161 | + " (CLASS = 'pos') as CLASS_POS, {all_columns_except_class!q} FROM IDA.TRAIN)\"\"\",\n", |
| 162 | + " {\"all_columns_except_class\": column_names})\n", |
| 163 | + "\n", |
| 164 | + "\n", |
| 165 | + "\n", |
| 166 | + "exasol.export_to_pandas(\"SELECT * FROM IDA.TRAIN_PREPARED LIMIT 4\")" |
| 167 | + ], |
| 168 | + "metadata": { |
| 169 | + "collapsed": false, |
| 170 | + "pycharm": { |
| 171 | + "name": "#%%\n" |
| 172 | + } |
| 173 | + } |
| 174 | + }, |
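|  | + { |
|  | + "cell_type": "markdown", |
|  | + "source": [ |
|  | + "Optional sanity check (an addition to the flow above, not required): the cell below counts how a condition on the SPLIT column partitions IDA.TRAIN_PREPARED, which should give roughly 80% training and 20% validation rows." |
|  | + ], |
|  | + "metadata": { |
|  | + "collapsed": false |
|  | + } |
|  | + }, |
|  | + { |
|  | + "cell_type": "code", |
|  | + "execution_count": null, |
|  | + "outputs": [], |
|  | + "source": [ |
|  | + "# Optional check: count the rows on each side of the SPLIT condition used for the export below\n", |
|  | + "exasol.export_to_pandas(\"\"\"SELECT\n", |
|  | + "    SUM(CASE WHEN SPLIT <= 0.8 THEN 1 ELSE 0 END) AS TRAIN_ROWS,\n", |
|  | + "    SUM(CASE WHEN SPLIT > 0.8 THEN 1 ELSE 0 END) AS VALIDATION_ROWS\n", |
|  | + "    FROM IDA.TRAIN_PREPARED\"\"\")" |
|  | + ], |
|  | + "metadata": { |
|  | + "collapsed": false, |
|  | + "pycharm": { |
|  | + "name": "#%%\n" |
|  | + } |
|  | + } |
|  | + }, |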
| 175 | + { |
| 176 | + "cell_type": "code", |
| 177 | + "execution_count": null, |
| 178 | + "outputs": [], |
| 179 | + "source": [ |
| 180 | + "exasol.execute(\"\"\"CREATE OR REPLACE TABLE IDA.TEST_PREPARED AS (\n", |
| 181 | + " SELECT\n", |
| 182 | + " (CLASS = 'pos') as CLASS_POS, {all_columns_except_class!q} FROM IDA.TEST)\"\"\",\n", |
| 183 | + " {\"all_columns_except_class\": column_names})\n", |
| 184 | + "\n", |
| 185 | + "\n", |
| 186 | + "\n", |
| 187 | + "exasol.export_to_pandas(\"SELECT * FROM IDA.TEST_PREPARED LIMIT 4\")" |
| 188 | + ], |
| 189 | + "metadata": { |
| 190 | + "collapsed": false, |
| 191 | + "pycharm": { |
| 192 | + "name": "#%%\n" |
| 193 | + } |
| 194 | + } |
| 195 | + }, |
128 | 196 | {
|
129 | 197 | "cell_type": "markdown",
|
130 | 198 | "source": [
|
|
170 | 238 | "collapsed": false
|
171 | 239 | }
|
172 | 240 | },
|
| 241 | + { |
| 242 | + "cell_type": "markdown", |
| 243 | + "source": [ |
| 244 | + "## todo\n", |
| 245 | + "- change and add explanation to preprocessing\n", |
| 246 | + "- update image of loaded tables (reload without split column beforehand)\n", |
| 247 | + "- add notze about selecing columns\n", |
| 248 | + "- add note about importing more than once -> appends not make new file!" |
| 249 | + ], |
| 250 | + "metadata": { |
| 251 | + "collapsed": false |
| 252 | + } |
| 253 | + }, |
| 254 | + { |
| 255 | + "cell_type": "code", |
| 256 | + "execution_count": null, |
| 257 | + "outputs": [], |
| 258 | + "source": [ |
| 259 | + "table = \"TEST_PREPARED\"\n", |
| 260 | + "column_names = ['CLASS_POS', 'AA_000', 'AG_005', 'AH_000', 'AL_000', 'AM_0', 'AN_000', 'AO_000', 'AP_000', 'AQ_000',\n", |
| 261 | + " 'AZ_004', 'BA_002', 'BB_000', 'BC_000', 'BD_000', 'BE_000',\n", |
| 262 | + " 'BF_000', 'BG_000', 'BH_000', 'BI_000', 'BJ_000', 'BS_000', 'BT_000', 'BU_000', 'BV_000',\n", |
| 263 | + " 'BX_000', 'BY_000', 'BZ_000', 'CA_000', 'CB_000', 'CC_000', 'CI_000', 'CN_004', 'CQ_000',\n", |
| 264 | + " 'CS_001', 'DD_000', 'DE_000', 'DN_000', 'DS_000', 'DU_000', 'DV_000', 'EB_000', 'EE_005']\n", |
| 265 | + "\n", |
| 266 | + "blobstorage_name = \"azureml-tutorial\" # change, remember to you might need to remove the \"_datastore\" suffix\n", |
| 267 | + "save_path = f'{blobstorage_name}/ida/{table}'\n", |
| 268 | + "sql_export = \"EXPORT (SELECT {column_names!q}\" + f\" FROM IDA.{table}) INTO CSV AT CLOUD AZURE BLOBSTORAGE 'DefaultEndpointsProtocol=https;EndpointSuffix=core.windows.net'\"\\\n", |
| 269 | + " f\"USER '{my_storage_account_name}' IDENTIFIED BY '{credentials.account_key}' FILE '{save_path}' WITH COLUMN NAMES\"\n", |
| 270 | + "exasol.execute(sql_export, {\"column_names\": column_names})\n", |
| 271 | + "print(f\"saved {table} in file {save_path}\")" |
| 272 | + ], |
| 273 | + "metadata": { |
| 274 | + "collapsed": false, |
| 275 | + "pycharm": { |
| 276 | + "name": "#%%\n" |
| 277 | + } |
| 278 | + } |
| 279 | + }, |
| 280 | + { |
| 281 | + "cell_type": "code", |
| 282 | + "execution_count": null, |
| 283 | + "outputs": [], |
| 284 | + "source": [ |
| 285 | + "\n", |
| 286 | + "table = \"TRAIN_PREPARED\"\n", |
| 287 | + "save_path = f'{blobstorage_name}/ida/{table}'\n", |
| 288 | + "sql_export = \"EXPORT (SELECT {column_names!q}\" + f\" FROM IDA.{table} WHERE SPLIT <= 0.8) INTO CSV AT CLOUD AZURE BLOBSTORAGE 'DefaultEndpointsProtocol=https;EndpointSuffix=core.windows.net'\"\\\n", |
| 289 | + " f\"USER '{my_storage_account_name}' IDENTIFIED BY '{credentials.account_key}' FILE '{save_path}' WITH COLUMN NAMES\"\n", |
| 290 | + "exasol.execute(sql_export, {\"column_names\": column_names})\n", |
| 291 | + "print(f\"saved {table} in file {save_path}\")\n", |
| 292 | + "\n", |
| 293 | + "save_path = f'{blobstorage_name}/ida/VALIDATE_PREPARED'\n", |
| 294 | + "sql_export = \"EXPORT (SELECT {column_names!q}\" + f\" FROM IDA.{table} WHERE SPLIT > 0.8) INTO CSV AT CLOUD AZURE BLOBSTORAGE 'DefaultEndpointsProtocol=https;EndpointSuffix=core.windows.net'\"\\\n", |
| 295 | + " f\"USER '{my_storage_account_name}' IDENTIFIED BY '{credentials.account_key}' FILE '{save_path}' WITH COLUMN NAMES\"\n", |
| 296 | + "exasol.execute(sql_export, {\"column_names\": column_names})\n", |
| 297 | + "print(f\"saved {table} in file {save_path}\")" |
| 298 | + ], |
| 299 | + "metadata": { |
| 300 | + "collapsed": false, |
| 301 | + "pycharm": { |
| 302 | + "name": "#%%\n" |
| 303 | + } |
| 304 | + } |
| 305 | + }, |
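|  | + { |
|  | + "cell_type": "markdown", |
|  | + "source": [ |
|  | + "Optional cleanup for re-runs: since an export appends to an existing file, you may want to delete previously exported files first. The cell below is a sketch of that, assuming the azure-storage-blob package is installed and reusing my_storage_account_name, credentials and blobstorage_name from above." |
|  | + ], |
|  | + "metadata": { |
|  | + "collapsed": false |
|  | + } |
|  | + }, |
|  | + { |
|  | + "cell_type": "code", |
|  | + "execution_count": null, |
|  | + "outputs": [], |
|  | + "source": [ |
|  | + "# Optional sketch: delete previously exported CSV files so a re-run starts from empty files.\n", |
|  | + "# Assumes the azure-storage-blob package; reuses variables defined earlier in this notebook.\n", |
|  | + "from azure.storage.blob import ContainerClient\n", |
|  | + "\n", |
|  | + "container_client = ContainerClient(\n", |
|  | + "    account_url=f\"https://{my_storage_account_name}.blob.core.windows.net\",\n", |
|  | + "    container_name=blobstorage_name,\n", |
|  | + "    credential=credentials.account_key)\n", |
|  | + "for blob_name in [\"ida/TEST_PREPARED\", \"ida/TRAIN_PREPARED\", \"ida/VALIDATE_PREPARED\"]:\n", |
|  | + "    if container_client.get_blob_client(blob_name).exists():\n", |
|  | + "        container_client.delete_blob(blob_name)" |
|  | + ], |
|  | + "metadata": { |
|  | + "collapsed": false, |
|  | + "pycharm": { |
|  | + "name": "#%%\n" |
|  | + } |
|  | + } |
|  | + }, |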
173 | 306 | {
|
174 | 307 | "cell_type": "code",
|
175 | 308 | "execution_count": null,
|
176 | 309 | "outputs": [],
|
177 | 310 | "source": [
|
178 |
| - "azure_storage_container_name = \"your-container-name\" # change, remember to you might need to remove the \"_datastore\" suffix\n", |
179 |
| - "\n", |
180 |
| - "for table in [\"TEST\", \"TRAIN\"]:\n", |
181 |
| - " save_path = f'{azure_storage_container_name}/ida/{table}'\n", |
182 |
| - " sql_export = f\"EXPORT TABLE IDA.{table} INTO CSV AT CLOUD AZURE BLOBSTORAGE 'DefaultEndpointsProtocol=https;EndpointSuffix=core.windows.net'\"\\\n", |
183 |
| - " f\"USER '{my_storage_account_name}' IDENTIFIED BY '{credentials.account_key}' FILE '{save_path}'\"\n", |
184 |
| - " exasol.execute(sql_export)\n", |
185 |
| - " print(f\"Saved {table} in file {save_path}\")\n" |
| 311 | + "for table in [\"TRAIN_PREPARED\", \"TEST_PREPARED\"]:\n", |
| 312 | + " exasol.execute(f\"DROP TABLE IDA.{table};\")" |
186 | 313 | ],
|
187 | 314 | "metadata": {
|
188 | 315 | "collapsed": false,
|
|