diff --git a/AMLNotebooks/01_Create_CreditRisk_AML_Pipeline.ipynb b/AMLNotebooks/01_Create_CreditRisk_AML_Pipeline.ipynb index 185fbaa..035799c 100644 --- a/AMLNotebooks/01_Create_CreditRisk_AML_Pipeline.ipynb +++ b/AMLNotebooks/01_Create_CreditRisk_AML_Pipeline.ipynb @@ -1,5 +1,15 @@ { "cells": [ + { + "cell_type": "markdown", + "id": "82095a0d", + "metadata": {}, + "source": [ + "Copyright (c) Microsoft. All rights reserved.\n", + "\n", + "Licensed under the MIT license." + ] + }, { "cell_type": "code", "execution_count": null, diff --git a/AMLNotebooks/02_Create_CreditRisk_AML_Pipeline_Lineage.ipynb b/AMLNotebooks/02_Create_CreditRisk_AML_Pipeline_Lineage.ipynb index 4ade05c..01cf1a8 100644 --- a/AMLNotebooks/02_Create_CreditRisk_AML_Pipeline_Lineage.ipynb +++ b/AMLNotebooks/02_Create_CreditRisk_AML_Pipeline_Lineage.ipynb @@ -1,5 +1,14 @@ { "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Copyright (c) Microsoft. All rights reserved.\n", + "\n", + "Licensed under the MIT license." + ] + }, { "cell_type": "code", "execution_count": null, diff --git a/AMLNotebooks/Authenticate_to_Purview_AML.py b/AMLNotebooks/Authenticate_to_Purview_AML.py index 25da686..cc4dc24 100644 --- a/AMLNotebooks/Authenticate_to_Purview_AML.py +++ b/AMLNotebooks/Authenticate_to_Purview_AML.py @@ -1,3 +1,7 @@ +# Copyright (c) Microsoft. All rights reserved. + +# Licensed under the MIT license. + def authentitae_to_purview_AML(): from pyapacheatlas.auth import ServicePrincipalAuthentication from pyapacheatlas.core import PurviewClient, AtlasClassification, AtlasEntity, AtlasProcess diff --git a/AMLNotebooks/Create_ML_Lineage_Functions.py b/AMLNotebooks/Create_ML_Lineage_Functions.py index c60cfbc..17b9707 100644 --- a/AMLNotebooks/Create_ML_Lineage_Functions.py +++ b/AMLNotebooks/Create_ML_Lineage_Functions.py @@ -1,4 +1,8 @@ # + +# Copyright (c) Microsoft. All rights reserved. + +# Licensed under the MIT license. + from pyapacheatlas.auth import ServicePrincipalAuthentication from pyapacheatlas.core import PurviewClient, AtlasClassification, AtlasEntity, AtlasProcess from pyapacheatlas.readers import ExcelConfiguration, ExcelReader diff --git a/AMLNotebooks/Create_ML_Lineage_Types.py b/AMLNotebooks/Create_ML_Lineage_Types.py index 68cc8bd..f65ca23 100644 --- a/AMLNotebooks/Create_ML_Lineage_Types.py +++ b/AMLNotebooks/Create_ML_Lineage_Types.py @@ -1,3 +1,7 @@ +# Copyright (c) Microsoft. All rights reserved. + +# Licensed under the MIT license. 
+ def create_ml_lineage_types(client): from pyapacheatlas.core.typedef import AtlasAttributeDef, EntityTypeDef, RelationshipTypeDef try: diff --git a/SynapseNotebooks/01_Authenticate_to_Purview_AML.ipynb b/SynapseNotebooks/01_Authenticate_to_Purview_AML.ipynb index 3c8a6ad..fbac7a7 100644 --- a/SynapseNotebooks/01_Authenticate_to_Purview_AML.ipynb +++ b/SynapseNotebooks/01_Authenticate_to_Purview_AML.ipynb @@ -1,260 +1,265 @@ { - "cells": [ - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from pyapacheatlas.auth import ServicePrincipalAuthentication\n", - "from pyapacheatlas.core import PurviewClient, AtlasClassification, AtlasEntity, AtlasProcess \n", - "from pyapacheatlas.readers import ExcelConfiguration, ExcelReader\n", - "from pyapacheatlas.core.util import GuidTracker\n", - "from pyapacheatlas.core import AtlasAttributeDef, AtlasEntity, PurviewClient\n", - "from pyapacheatlas.core.typedef import EntityTypeDef" - ] + "cells": [ + { + "cell_type": "markdown", + "source": [ + "Copyright (c) Microsoft. All rights reserved.\r\n", + "\r\n", + "Licensed under the MIT license." + ], + "metadata": { + "nteract": { + "transient": { + "deleting": false + } + } + } + }, + { + "cell_type": "code", + "source": [ + "from pyapacheatlas.auth import ServicePrincipalAuthentication\n", + "from pyapacheatlas.core import PurviewClient, AtlasClassification, AtlasEntity, AtlasProcess \n", + "from pyapacheatlas.readers import ExcelConfiguration, ExcelReader\n", + "from pyapacheatlas.core.util import GuidTracker\n", + "from pyapacheatlas.core import AtlasAttributeDef, AtlasEntity, PurviewClient\n", + "from pyapacheatlas.core.typedef import EntityTypeDef" + ], + "outputs": [], + "execution_count": null, + "metadata": {} + }, + { + "cell_type": "code", + "source": [ + "# get SPN details you created in step 2.1 of solution accelerator setup\n", + "tenant_id = \"\"\n", + "client_id = \"\"\n", + "client_secret = \"\"\n", + "\n", + "# get Purview account name from azure portal\n", + "purview_name = \"\"\n" + ], + "outputs": [], + "execution_count": null, + "metadata": {} + }, + { + "cell_type": "code", + "source": [ + "# get AML workspace details from azure portal\n", + "subscription_id = \"\" \n", + "resource_group = \"\"\n", + "workspace_name = \"\"\n", + "workspace_region = \"\"\n" + ], + "outputs": [], + "execution_count": null, + "metadata": {} + }, + { + "cell_type": "code", + "source": [ + "from pyapacheatlas.auth import ServicePrincipalAuthentication\n", + "from pyapacheatlas.core import PurviewClient\n", + "from pyapacheatlas.core.util import GuidTracker\n", + "\n", + "# Authenticate to your Atlas server using a Service Principal\n", + "oauth = ServicePrincipalAuthentication(\n", + " tenant_id= tenant_id,\n", + " client_id= client_id,\n", + " client_secret= client_secret\n", + ")\n", + "client = PurviewClient(\n", + " account_name = purview_name,\n", + " authentication=oauth\n", + ")\n", + "guid = GuidTracker()" + ], + "outputs": [], + "execution_count": null, + "metadata": {} + }, + { + "cell_type": "code", + "source": [ + "# get SPN details you created in step 3.1 of solution accelerator setup\n", + "aml_client_id = \"\"\n", + "aml_client_secret = \"\"" + ], + "outputs": [], + "execution_count": null, + "metadata": {} + }, + { + "cell_type": "code", + "source": [ + "from azureml.core.authentication import ServicePrincipalAuthentication\n", + "\n", + "sp = ServicePrincipalAuthentication(tenant_id=tenant_id, \n", + " 
service_principal_id=aml_client_id, \n", + " service_principal_password=aml_client_secret)\n", + "\n", + "from azureml.core import Workspace\n", + "\n", + "ws = Workspace.get(name=workspace_name,\n", + " resource_group = resource_group,\n", + " auth=sp,\n", + " subscription_id=subscription_id)" + ], + "outputs": [], + "execution_count": null, + "metadata": {} + }, + { + "cell_type": "code", + "source": [], + "outputs": [], + "execution_count": null, + "metadata": {} + }, + { + "cell_type": "code", + "source": [ + "# We recommend you add your service principal secrets in KeyVault instead of hardcoded values above\n", + "# Create a linked service for key vault in Synapse Studio \n", + "# See below code snippet how to access secrets from KeyVault " + ], + "outputs": [], + "execution_count": null, + "metadata": {} + }, + { + "cell_type": "code", + "source": [ + "# linked_service = \"AzureKeyVault1\" # Azure Key Vault Linked Service name \n", + "# akv_name = \"\" # Azure Key Vault name\n", + "# secret_name = \"\" # Azure Key Vault Secret name\n", + "\n", + "# # Fetch the key from Azure Key Vault\n", + "# aml_spn = mssparkutils.credentials.getSecret(\n", + "# linkedService=linked_service,\n", + "# akvName=akv_name, \n", + "# secret=secret_name)\n", + "\n", + "# linked_service = \"AzureKeyVault1\" # Azure Key Vault Linked Service name \n", + "# akv_name = \"\" # Azure Key Vault name\n", + "# secret_name = \"\" # Azure Key Vault Secret name\n", + "\n", + "# # Fetch the key from Azure Key Vault\n", + "# purview_spn = mssparkutils.credentials.getSecret(\n", + "# linkedService=linked_service,\n", + "# akvName=akv_name, \n", + "# # secret=secret_name)" + ], + "outputs": [], + "execution_count": null, + "metadata": {} + }, + { + "cell_type": "code", + "source": [ + "# # get SPN details you created in step 2.1 of solution accelerator installation\n", + "# tenant_id = \"\"\n", + "# purview_client_id = \"\"\n", + "# purview_name = \"\"\n", + "# purview_client_secret = purview_spn" + ], + "outputs": [], + "execution_count": null, + "metadata": {} + }, + { + "cell_type": "code", + "source": [ + "# import json\n", + "# import os\n", + "# import sys\n", + "\n", + "# from pyapacheatlas.auth import ServicePrincipalAuthentication\n", + "# from pyapacheatlas.core import PurviewClient, AtlasClassification, AtlasEntity, AtlasProcess \n", + "# from pyapacheatlas.readers import ExcelConfiguration, ExcelReader\n", + "# from pyapacheatlas.core.util import GuidTracker\n", + "# from pyapacheatlas.core import AtlasAttributeDef, AtlasEntity, PurviewClient\n", + "# from pyapacheatlas.core.typedef import EntityTypeDef\n", + "\n", + "# # Authenticate to your Atlas server using a Service Principal\n", + "# oauth = ServicePrincipalAuthentication(\n", + "# tenant_id= tenant_id,\n", + "# client_id= purview_client_id,\n", + "# client_secret= purview_client_secret\n", + "# )\n", + "# client = PurviewClient(\n", + "# account_name = purview_name,\n", + "# authentication=oauth\n", + "# )\n", + "# guid = GuidTracker()" + ], + "outputs": [], + "execution_count": null, + "metadata": {} + }, + { + "cell_type": "code", + "source": [ + "# # get AML workspace details from azure portal\n", + "# subscription_id = \"\" \n", + "# resource_group = \"\"\n", + "# workspace_name = \"\"" + ], + "outputs": [], + "execution_count": null, + "metadata": {} + }, + { + "cell_type": "code", + "source": [ + "# # get SPN details you created in step 2.1 of solution accelerator installation\n", + "# tenant_id = \"\"\n", + "# aml_client_id = \"\"\n", + "# 
aml_client_secret = aml_spn" + ], + "outputs": [], + "execution_count": null, + "metadata": {} + }, + { + "cell_type": "code", + "source": [ + "# # Authentiate to AML\n", + "\n", + "# from azureml.core.authentication import ServicePrincipalAuthentication\n", + "\n", + "# sp = ServicePrincipalAuthentication(tenant_id=tenant_id, \n", + "# service_principal_id=aml_client_id, \n", + "# service_principal_password=aml_client_secret) \n", + "\n", + "# from azureml.core import Workspace\n", + "\n", + "# ws = Workspace.get(name=workspace_name,\n", + "# resource_group = resource_group,\n", + "# auth=sp,\n", + "# subscription_id=subscription_id)" + ], + "outputs": [], + "execution_count": null, + "metadata": {} + } + ], + "metadata": { + "kernelspec": { + "name": "synapse_pyspark", + "display_name": "python" + }, + "language_info": { + "name": "python" + }, + "save_output": true, + "synapse_widget": { + "version": "0.1", + "state": {} + } }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# get SPN details you created in step 2.1 of solution accelerator setup\n", - "tenant_id = \"\"\n", - "client_id = \"\"\n", - "client_secret = \"\"\n", - "\n", - "# get Purview account name from azure portal\n", - "purview_name = \"\"\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# get AML workspace details from azure portal\n", - "subscription_id = \"\" \n", - "resource_group = \"\"\n", - "workspace_name = \"\"\n", - "workspace_region = \"\"\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from pyapacheatlas.auth import ServicePrincipalAuthentication\n", - "from pyapacheatlas.core import PurviewClient\n", - "from pyapacheatlas.core.util import GuidTracker\n", - "\n", - "# Authenticate to your Atlas server using a Service Principal\n", - "oauth = ServicePrincipalAuthentication(\n", - " tenant_id= tenant_id,\n", - " client_id= client_id,\n", - " client_secret= client_secret\n", - ")\n", - "client = PurviewClient(\n", - " account_name = purview_name,\n", - " authentication=oauth\n", - ")\n", - "guid = GuidTracker()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# get SPN details you created in step 3.1 of solution accelerator setup\n", - "aml_client_id = \"\"\n", - "aml_client_secret = \"\"" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from azureml.core.authentication import ServicePrincipalAuthentication\n", - "\n", - "sp = ServicePrincipalAuthentication(tenant_id=tenant_id, \n", - " service_principal_id=aml_client_id, \n", - " service_principal_password=aml_client_secret)\n", - "\n", - "from azureml.core import Workspace\n", - "\n", - "ws = Workspace.get(name=workspace_name,\n", - " resource_group = resource_group,\n", - " auth=sp,\n", - " subscription_id=subscription_id)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# We recommend you add your service principal secrets in KeyVault instead of hardcoded values above\n", - "# Create a linked service for key vault in Synapse Studio \n", - "# See below code snippet how to access secrets from KeyVault " - ] - }, - { - "cell_type": "code", - "execution_count": 
null, - "metadata": {}, - "outputs": [], - "source": [ - "# linked_service = \"AzureKeyVault1\" # Azure Key Vault Linked Service name \n", - "# akv_name = \"\" # Azure Key Vault name\n", - "# secret_name = \"\" # Azure Key Vault Secret name\n", - "\n", - "# # Fetch the key from Azure Key Vault\n", - "# aml_spn = mssparkutils.credentials.getSecret(\n", - "# linkedService=linked_service,\n", - "# akvName=akv_name, \n", - "# secret=secret_name)\n", - "\n", - "# linked_service = \"AzureKeyVault1\" # Azure Key Vault Linked Service name \n", - "# akv_name = \"\" # Azure Key Vault name\n", - "# secret_name = \"\" # Azure Key Vault Secret name\n", - "\n", - "# # Fetch the key from Azure Key Vault\n", - "# purview_spn = mssparkutils.credentials.getSecret(\n", - "# linkedService=linked_service,\n", - "# akvName=akv_name, \n", - "# # secret=secret_name)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# # get SPN details you created in step 2.1 of solution accelerator installation\n", - "# tenant_id = \"\"\n", - "# purview_client_id = \"\"\n", - "# purview_name = \"\"\n", - "# purview_client_secret = purview_spn" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# import json\n", - "# import os\n", - "# import sys\n", - "\n", - "# from pyapacheatlas.auth import ServicePrincipalAuthentication\n", - "# from pyapacheatlas.core import PurviewClient, AtlasClassification, AtlasEntity, AtlasProcess \n", - "# from pyapacheatlas.readers import ExcelConfiguration, ExcelReader\n", - "# from pyapacheatlas.core.util import GuidTracker\n", - "# from pyapacheatlas.core import AtlasAttributeDef, AtlasEntity, PurviewClient\n", - "# from pyapacheatlas.core.typedef import EntityTypeDef\n", - "\n", - "# # Authenticate to your Atlas server using a Service Principal\n", - "# oauth = ServicePrincipalAuthentication(\n", - "# tenant_id= tenant_id,\n", - "# client_id= purview_client_id,\n", - "# client_secret= purview_client_secret\n", - "# )\n", - "# client = PurviewClient(\n", - "# account_name = purview_name,\n", - "# authentication=oauth\n", - "# )\n", - "# guid = GuidTracker()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# # get AML workspace details from azure portal\n", - "# subscription_id = \"\" \n", - "# resource_group = \"\"\n", - "# workspace_name = \"\"" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# # get SPN details you created in step 2.1 of solution accelerator installation\n", - "# tenant_id = \"\"\n", - "# aml_client_id = \"\"\n", - "# aml_client_secret = aml_spn" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# # Authentiate to AML\n", - "\n", - "# from azureml.core.authentication import ServicePrincipalAuthentication\n", - "\n", - "# sp = ServicePrincipalAuthentication(tenant_id=tenant_id, \n", - "# service_principal_id=aml_client_id, \n", - "# service_principal_password=aml_client_secret) \n", - "\n", - "# from azureml.core import Workspace\n", - "\n", - "# ws = Workspace.get(name=workspace_name,\n", - "# resource_group = resource_group,\n", - "# auth=sp,\n", - "# subscription_id=subscription_id)" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - 
"name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.7.4" - }, - "save_output": true, - "synapse_widget": { - "state": {}, - "version": "0.1" - } - }, - "nbformat": 4, - "nbformat_minor": 2 -} + "nbformat": 4, + "nbformat_minor": 2 +} \ No newline at end of file diff --git a/SynapseNotebooks/02_Create_ML_Lineage_Types.ipynb b/SynapseNotebooks/02_Create_ML_Lineage_Types.ipynb index cc0a005..e33c759 100644 --- a/SynapseNotebooks/02_Create_ML_Lineage_Types.ipynb +++ b/SynapseNotebooks/02_Create_ML_Lineage_Types.ipynb @@ -1,368 +1,373 @@ { - "cells": [ - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true, - "jupyter": { - "outputs_hidden": false, - "source_hidden": false + "cells": [ + { + "cell_type": "markdown", + "source": [ + "Copyright (c) Microsoft. All rights reserved.\r\n", + "\r\n", + "Licensed under the MIT license." + ], + "metadata": { + "nteract": { + "transient": { + "deleting": false + } + } + } }, - "nteract": { - "transient": { - "deleting": false - } + { + "cell_type": "code", + "source": [ + "from pyapacheatlas.core.typedef import AtlasAttributeDef, EntityTypeDef, RelationshipTypeDef\n", + "\n", + "try:\n", + " #-----------------------------------------------------------------------------------# \n", + " #create custom dataset type\n", + " type_df = EntityTypeDef(\n", + " name=\"custom_dataset\",\n", + " attributeDefs=[\n", + " AtlasAttributeDef(name=\"format\")\n", + " ],\n", + " superTypes = [\"DataSet\"]\n", + " )\n", + " typedef_results = client.upload_typedefs(entityDefs = [type_df], force_update=True)\n", + "\n", + " #-----------------------------------------------------------------------------------# \n", + " #create process with column mapping type\n", + " type_df = EntityTypeDef(\n", + " name=\"ProcessWithColumnMapping\",\n", + " attributeDefs=[\n", + " AtlasAttributeDef(name=\"columnMapping\")\n", + " ],\n", + " superTypes = [\"Process\"]\n", + " )\n", + " typedef_results = client.upload_typedefs(entityDefs = [type_df], force_update=True)\n", + "\n", + " #-----------------------------------------------------------------------------------# \n", + " #create AML workspace type\n", + " type_df = EntityTypeDef(\n", + " name=\"custom_ml_workspace\",\n", + " attributeDefs=[\n", + " AtlasAttributeDef(name='name',typename='string'),\n", + " AtlasAttributeDef(name='description',typename='string'),\n", + " AtlasAttributeDef(name='subscription_id',typename='string'),\n", + " AtlasAttributeDef(name='resource_group',typename='string')\n", + " ],\n", + " superTypes = [\"DataSet\"]\n", + " )\n", + " typedef_results = client.upload_typedefs(entityDefs = [type_df], force_update=True)\n", + " #-----------------------------------------------------------------------------------# \n", + " #create types for datastore and dataset\n", + "\n", + " #create AML datastore type\n", + " datastore_type_df = EntityTypeDef(\n", + " name=\"custom_ml_datastore\",\n", + " attributeDefs=[\n", + " AtlasAttributeDef(name=\"name\",typename='string'),\n", + " AtlasAttributeDef(name='container_name',typename='string'),\n", + " AtlasAttributeDef(name='account_name',typename='string'),\n", + " AtlasAttributeDef(name='protocol',typename='string'),\n", + " AtlasAttributeDef(name='endpoint',typename='string'),\n", + " AtlasAttributeDef(name='server_name',typename='string'),\n", + " 
AtlasAttributeDef(name='database_name',typename='string'),\n", + " AtlasAttributeDef(name=\"createdby\",typename='string')\n", + " ],\n", + " superTypes = [\"DataSet\"],\n", + " options = {\"schemaElementAttribute\":\"dataset\"}\n", + " )\n", + "\n", + " #create AML dataset type\n", + " dataset_type_df = EntityTypeDef(\n", + " name=\"custom_ml_dataset\",\n", + " attributeDefs=[\n", + " AtlasAttributeDef(name=\"name\",typename='string'),\n", + " AtlasAttributeDef(name=\"description\",typename='string'),\n", + " AtlasAttributeDef(name=\"createdby\",typename='string'),\n", + " AtlasAttributeDef(name=\"createdtime\",typename='string')\n", + " ],\n", + " superTypes = [\"DataSet\"]\n", + " )\n", + "\n", + " # create relationsip between datastore and dataset\n", + " dataset_to_datastore_relationship = RelationshipTypeDef(\n", + " name=\"custom_ml_datastore_to_dataset\",\n", + " relationshipCategory=\"COMPOSITION\",\n", + " endDef1={\n", + " \"type\": \"custom_ml_datastore\",\n", + " \"name\": \"dataset\",\n", + " \"isContainer\": True,\n", + " \"cardinality\": \"SET\",\n", + " \"isLegacyAttribute\": False\n", + " },\n", + " endDef2={\n", + " \"type\": \"custom_ml_dataset\",\n", + " \"name\": \"datastore\",\n", + " \"isContainer\": False,\n", + " \"cardinality\": \"SINGLE\",\n", + " \"isLegacyAttribute\": False\n", + " }\n", + " )\n", + "\n", + " typedef_results = client.upload_typedefs(\n", + " entityDefs = [datastore_type_df, dataset_type_df],\n", + " relationshipDefs = [dataset_to_datastore_relationship],\n", + " force_update=True\n", + " )\n", + " #-----------------------------------------------------------------------------------# \n", + " #create types for experiment and experimentstep\n", + " \n", + " #create process for Ml Experiment Step\n", + " exp_type_df = EntityTypeDef(\n", + " name=\"custom_ml_experiment\",\n", + " attributeDefs=[\n", + " AtlasAttributeDef(name='name',typename='string'),\n", + " AtlasAttributeDef(name='notes',typename='string'),\n", + " AtlasAttributeDef(name=\"createdby\",typename='string'),\n", + " AtlasAttributeDef(name=\"createdtime\",typename='string')\n", + " ],\n", + " superTypes = [\"Process\"]\n", + " )\n", + "\n", + " #create process for Ml Experiment Step\n", + " exp_step_type_df = EntityTypeDef(\n", + " name=\"custom_ml_experiment_step\",\n", + " attributeDefs=[\n", + " AtlasAttributeDef(name='notes',typename='string')\n", + " ],\n", + " superTypes = [\"Process\"]\n", + " )\n", + "\n", + " # create relationsip between experiment and experimentstep\n", + " step_to_exp_relationship = RelationshipTypeDef(\n", + " name=\"custom_ml_experiment_to_experimentstep\",\n", + " relationshipCategory=\"COMPOSITION\",\n", + " endDef1={\n", + " \"type\": \"custom_ml_experiment\",\n", + " \"name\": \"experimentstep\",\n", + " \"isContainer\": True,\n", + " \"cardinality\": \"SET\",\n", + " \"isLegacyAttribute\": False\n", + " },\n", + " endDef2={\n", + " \"type\": \"custom_ml_experiment_step\",\n", + " \"name\": \"experiment\",\n", + " \"isContainer\": False,\n", + " \"cardinality\": \"SINGLE\",\n", + " \"isLegacyAttribute\": False\n", + " }\n", + " )\n", + "\n", + " typedef_results = client.upload_typedefs(\n", + " entityDefs = [exp_type_df, exp_step_type_df],\n", + " relationshipDefs = [step_to_exp_relationship],\n", + " force_update=True\n", + " )\n", + " #-----------------------------------------------------------------------------------# \n", + " \n", + " rd = RelationshipTypeDef(\n", + " name=\"custom_ml_workspace_datastore\",\n", + " attributeDefs=[],\n", + " 
relationshipCategory=\"COMPOSITION\", # Means the child can't exist without the parent\n", + " endDef1={ # endDef1 decribes what the parent will have as an attribute\n", + " \"type\":\"custom_ml_workspace\", # Type of the parent\n", + " \"name\":\"datastores\", # What the parent will have\n", + " \"isContainer\": True,\n", + " \"cardinality\":\"SET\", # This is related to the cardinality, in this case the parent Server will have a SET of Models.\n", + " \"isLegacyAttribute\":False\n", + " },\n", + " endDef2={ # endDef2 decribes what the child will have as an attribute\n", + " \"type\":\"custom_ml_datastore\", # Type of the child\n", + " \"name\":\"workspace\", # What the child will have\n", + " \"isContainer\":False,\n", + " \"cardinality\":\"SINGLE\",\n", + " \"isLegacyAttribute\":False\n", + " }\n", + " )\n", + " client.upload_typedefs(relationshipDefs=[rd])\n", + " \n", + " #-----------------------------------------------------------------------------------# \n", + " rd = RelationshipTypeDef(\n", + " name=\"custom_ml_workspace_experiment\",\n", + " attributeDefs=[],\n", + " relationshipCategory=\"COMPOSITION\", # Means the child can't exist without the parent\n", + " endDef1={ # endDef1 decribes what the parent will have as an attribute\n", + " \"type\":\"custom_ml_workspace\", # Type of the parent\n", + " \"name\":\"experiments\", # What the parent will have\n", + " \"isContainer\": True,\n", + " \"cardinality\":\"SET\", # This is related to the cardinality, in this case the parent Server will have a SET of Models.\n", + " \"isLegacyAttribute\":False\n", + " },\n", + " endDef2={ # endDef2 decribes what the child will have as an attribute\n", + " \"type\":\"custom_ml_experiment\", # Type of the child\n", + " \"name\":\"workspace\", # What the child will have\n", + " \"isContainer\":False,\n", + " \"cardinality\":\"SINGLE\",\n", + " \"isLegacyAttribute\":False\n", + " }\n", + " )\n", + " client.upload_typedefs(relationshipDefs=[rd])\n", + "\n", + " #-----------------------------------------------------------------------------------# \n", + " #create types for packages and package\n", + "\n", + " #create packages type\n", + " packages_type_df = EntityTypeDef(\n", + " name=\"custom_ml_packages\",\n", + " attributeDefs=[\n", + " AtlasAttributeDef(name='notes',typename='string')\n", + " ],\n", + " superTypes = [\"DataSet\"],\n", + " options = {\"schemaElementAttribute\":\"package\"}\n", + " )\n", + "\n", + " package_type_df = EntityTypeDef(\n", + " name=\"custom_ml_package\",\n", + " attributeDefs=[\n", + " AtlasAttributeDef(name='programming_language',typename='string'),\n", + " AtlasAttributeDef(name='package_name',typename='string'),\n", + " AtlasAttributeDef(name='version',typename='string'),\n", + " AtlasAttributeDef(name='notes',typename='string')\n", + " ],\n", + " superTypes = [\"DataSet\"]\n", + " )\n", + "\n", + " # create relationsip between packages and package\n", + " package_to_packages_relationship = RelationshipTypeDef(\n", + " name=\"custom_ml_packages_to_package\",\n", + " relationshipCategory=\"COMPOSITION\",\n", + " endDef1={\n", + " \"type\": \"custom_ml_packages\",\n", + " \"name\": \"package\",\n", + " \"isContainer\": True,\n", + " \"cardinality\": \"SET\",\n", + " \"isLegacyAttribute\": False\n", + " },\n", + " endDef2={\n", + " \"type\": \"custom_ml_package\",\n", + " \"name\": \"packages\",\n", + " \"isContainer\": False,\n", + " \"cardinality\": \"SINGLE\",\n", + " \"isLegacyAttribute\": False\n", + " }\n", + " )\n", + "\n", + " typedef_results = 
client.upload_typedefs(\n", + " entityDefs = [packages_type_df, package_type_df],\n", + " relationshipDefs = [package_to_packages_relationship],\n", + " force_update=True\n", + " )\n", + " #-----------------------------------------------------------------------------------# \n", + " \n", + " #create experiemnt config type\n", + " type_df = EntityTypeDef(\n", + " name=\"custom_ml_exp_config\",\n", + " attributeDefs=[\n", + " AtlasAttributeDef(name='task_type',typename='string'),\n", + " AtlasAttributeDef(name='enable_early_stopping',typename='bool'),\n", + " AtlasAttributeDef(name='experiment_timeout_minutes',typename='int'),\n", + " AtlasAttributeDef(name='primary_metric',typename='string'),\n", + " AtlasAttributeDef(name='compute_target',typename='string'),\n", + " AtlasAttributeDef(name='label_column_name',typename='string'),\n", + " AtlasAttributeDef(name='n_cross_validations',typename='int'),\n", + " AtlasAttributeDef(name='model_explainability',typename='bool')\n", + " ],\n", + " superTypes = [\"DataSet\"]\n", + " )\n", + " typedef_results = client.upload_typedefs(entityDefs = [type_df], force_update=True)\n", + " \n", + " #-----------------------------------------------------------------------------------# \n", + "\n", + " #create model metrics type\n", + " type_df = EntityTypeDef(\n", + " name=\"custom_ml_model_metrics\",\n", + " attributeDefs=[\n", + " AtlasAttributeDef(name='AUC',typename='float'),\n", + " AtlasAttributeDef(name='Accuracy',typename='float'),\n", + " AtlasAttributeDef(name='Precision',typename='float'),\n", + " AtlasAttributeDef(name='Recall',typename='float'),\n", + " AtlasAttributeDef(name='F1',typename='float')\n", + " ],\n", + " superTypes = [\"DataSet\"]\n", + " )\n", + " typedef_results = client.upload_typedefs(entityDefs = [type_df], force_update=True)\n", + " \n", + " #-----------------------------------------------------------------------------------# \n", + "\n", + " #create model type\n", + " type_df = EntityTypeDef(\n", + " name=\"custom_ml_model\",\n", + " attributeDefs=[\n", + " AtlasAttributeDef(name='workspace_name',typename='string'),\n", + " AtlasAttributeDef(name='workspace_subscription_id',typename='string'),\n", + " AtlasAttributeDef(name='workspace_resource_group',typename='string'),\n", + " AtlasAttributeDef(name='name',typename='string'),\n", + " AtlasAttributeDef(name='id',typename='string'),\n", + " AtlasAttributeDef(name='version',typename='string'),\n", + " AtlasAttributeDef(name='tags',typename='string'),\n", + " AtlasAttributeDef(name='properties',typename='string')\n", + " ],\n", + " superTypes = [\"DataSet\"]\n", + " )\n", + " typedef_results = client.upload_typedefs(entityDefs = [type_df], force_update=True)\n", + " \n", + " #-----------------------------------------------------------------------------------# \n", + "\n", + " #create endpoint type\n", + " type_df = EntityTypeDef(\n", + " name=\"custom_ml_model_endpoint\",\n", + " attributeDefs=[\n", + " AtlasAttributeDef(name='workspace_name',typename='string'),\n", + " AtlasAttributeDef(name='workspace_subscription_id',typename='string'),\n", + " AtlasAttributeDef(name='workspace_resource_group',typename='string'),\n", + " AtlasAttributeDef(name='name',typename='string'),\n", + " AtlasAttributeDef(name='image_id',typename='string'),\n", + " AtlasAttributeDef(name='compute_type',typename='string'),\n", + " AtlasAttributeDef(name='state',typename='string'),\n", + " AtlasAttributeDef(name='scoring_uri',typename='string'),\n", + " AtlasAttributeDef(name='tags',typename='string'),\n", + 
" AtlasAttributeDef(name='state',typename='string'),\n", + " AtlasAttributeDef(name='properties',typename='string'),\n", + " AtlasAttributeDef(name='created_by',typename='string'),\n", + " AtlasAttributeDef(name='sample_json',typename='string')\n", + " ],\n", + " superTypes = [\"DataSet\"]\n", + " )\n", + " typedef_results = client.upload_typedefs(entityDefs = [type_df], force_update=True)\n", + " \n", + " #-----------------------------------------------------------------------------------# \n", + "except:\n", + " print('types already created') " + ], + "outputs": [], + "execution_count": null, + "metadata": { + "jupyter": { + "outputs_hidden": false, + "source_hidden": false + }, + "nteract": { + "transient": { + "deleting": false + } + }, + "collapsed": true + } + } + ], + "metadata": { + "kernelspec": { + "name": "synapse_pyspark", + "display_name": "python" + }, + "language_info": { + "name": "python" + }, + "save_output": true, + "synapse_widget": { + "version": "0.1", + "state": {} } - }, - "outputs": [], - "source": [ - "from pyapacheatlas.core.typedef import AtlasAttributeDef, EntityTypeDef, RelationshipTypeDef\n", - "\n", - "try:\n", - " #-----------------------------------------------------------------------------------# \n", - " #create custom dataset type\n", - " type_df = EntityTypeDef(\n", - " name=\"custom_dataset\",\n", - " attributeDefs=[\n", - " AtlasAttributeDef(name=\"format\")\n", - " ],\n", - " superTypes = [\"DataSet\"]\n", - " )\n", - " typedef_results = client.upload_typedefs(entityDefs = [type_df], force_update=True)\n", - "\n", - " #-----------------------------------------------------------------------------------# \n", - " #create process with column mapping type\n", - " type_df = EntityTypeDef(\n", - " name=\"ProcessWithColumnMapping\",\n", - " attributeDefs=[\n", - " AtlasAttributeDef(name=\"columnMapping\")\n", - " ],\n", - " superTypes = [\"Process\"]\n", - " )\n", - " typedef_results = client.upload_typedefs(entityDefs = [type_df], force_update=True)\n", - "\n", - " #-----------------------------------------------------------------------------------# \n", - " #create AML workspace type\n", - " type_df = EntityTypeDef(\n", - " name=\"custom_ml_workspace\",\n", - " attributeDefs=[\n", - " AtlasAttributeDef(name='name',typename='string'),\n", - " AtlasAttributeDef(name='description',typename='string'),\n", - " AtlasAttributeDef(name='subscription_id',typename='string'),\n", - " AtlasAttributeDef(name='resource_group',typename='string')\n", - " ],\n", - " superTypes = [\"DataSet\"]\n", - " )\n", - " typedef_results = client.upload_typedefs(entityDefs = [type_df], force_update=True)\n", - " #-----------------------------------------------------------------------------------# \n", - " #create types for datastore and dataset\n", - "\n", - " #create AML datastore type\n", - " datastore_type_df = EntityTypeDef(\n", - " name=\"custom_ml_datastore\",\n", - " attributeDefs=[\n", - " AtlasAttributeDef(name=\"name\",typename='string'),\n", - " AtlasAttributeDef(name='container_name',typename='string'),\n", - " AtlasAttributeDef(name='account_name',typename='string'),\n", - " AtlasAttributeDef(name='protocol',typename='string'),\n", - " AtlasAttributeDef(name='endpoint',typename='string'),\n", - " AtlasAttributeDef(name='server_name',typename='string'),\n", - " AtlasAttributeDef(name='database_name',typename='string'),\n", - " AtlasAttributeDef(name=\"createdby\",typename='string')\n", - " ],\n", - " superTypes = [\"DataSet\"],\n", - " options = 
{\"schemaElementAttribute\":\"dataset\"}\n", - " )\n", - "\n", - " #create AML dataset type\n", - " dataset_type_df = EntityTypeDef(\n", - " name=\"custom_ml_dataset\",\n", - " attributeDefs=[\n", - " AtlasAttributeDef(name=\"name\",typename='string'),\n", - " AtlasAttributeDef(name=\"description\",typename='string'),\n", - " AtlasAttributeDef(name=\"createdby\",typename='string'),\n", - " AtlasAttributeDef(name=\"createdtime\",typename='string')\n", - " ],\n", - " superTypes = [\"DataSet\"]\n", - " )\n", - "\n", - " # create relationsip between datastore and dataset\n", - " dataset_to_datastore_relationship = RelationshipTypeDef(\n", - " name=\"custom_ml_datastore_to_dataset\",\n", - " relationshipCategory=\"COMPOSITION\",\n", - " endDef1={\n", - " \"type\": \"custom_ml_datastore\",\n", - " \"name\": \"dataset\",\n", - " \"isContainer\": True,\n", - " \"cardinality\": \"SET\",\n", - " \"isLegacyAttribute\": False\n", - " },\n", - " endDef2={\n", - " \"type\": \"custom_ml_dataset\",\n", - " \"name\": \"datastore\",\n", - " \"isContainer\": False,\n", - " \"cardinality\": \"SINGLE\",\n", - " \"isLegacyAttribute\": False\n", - " }\n", - " )\n", - "\n", - " typedef_results = client.upload_typedefs(\n", - " entityDefs = [datastore_type_df, dataset_type_df],\n", - " relationshipDefs = [dataset_to_datastore_relationship],\n", - " force_update=True\n", - " )\n", - " #-----------------------------------------------------------------------------------# \n", - " #create types for experiment and experimentstep\n", - " \n", - " #create process for Ml Experiment Step\n", - " exp_type_df = EntityTypeDef(\n", - " name=\"custom_ml_experiment\",\n", - " attributeDefs=[\n", - " AtlasAttributeDef(name='name',typename='string'),\n", - " AtlasAttributeDef(name='notes',typename='string'),\n", - " AtlasAttributeDef(name=\"createdby\",typename='string'),\n", - " AtlasAttributeDef(name=\"createdtime\",typename='string')\n", - " ],\n", - " superTypes = [\"Process\"]\n", - " )\n", - "\n", - " #create process for Ml Experiment Step\n", - " exp_step_type_df = EntityTypeDef(\n", - " name=\"custom_ml_experiment_step\",\n", - " attributeDefs=[\n", - " AtlasAttributeDef(name='notes',typename='string')\n", - " ],\n", - " superTypes = [\"Process\"]\n", - " )\n", - "\n", - " # create relationsip between experiment and experimentstep\n", - " step_to_exp_relationship = RelationshipTypeDef(\n", - " name=\"custom_ml_experiment_to_experimentstep\",\n", - " relationshipCategory=\"COMPOSITION\",\n", - " endDef1={\n", - " \"type\": \"custom_ml_experiment\",\n", - " \"name\": \"experimentstep\",\n", - " \"isContainer\": True,\n", - " \"cardinality\": \"SET\",\n", - " \"isLegacyAttribute\": False\n", - " },\n", - " endDef2={\n", - " \"type\": \"custom_ml_experiment_step\",\n", - " \"name\": \"experiment\",\n", - " \"isContainer\": False,\n", - " \"cardinality\": \"SINGLE\",\n", - " \"isLegacyAttribute\": False\n", - " }\n", - " )\n", - "\n", - " typedef_results = client.upload_typedefs(\n", - " entityDefs = [exp_type_df, exp_step_type_df],\n", - " relationshipDefs = [step_to_exp_relationship],\n", - " force_update=True\n", - " )\n", - " #-----------------------------------------------------------------------------------# \n", - " \n", - " rd = RelationshipTypeDef(\n", - " name=\"custom_ml_workspace_datastore\",\n", - " attributeDefs=[],\n", - " relationshipCategory=\"COMPOSITION\", # Means the child can't exist without the parent\n", - " endDef1={ # endDef1 decribes what the parent will have as an attribute\n", - " 
\"type\":\"custom_ml_workspace\", # Type of the parent\n", - " \"name\":\"datastores\", # What the parent will have\n", - " \"isContainer\": True,\n", - " \"cardinality\":\"SET\", # This is related to the cardinality, in this case the parent Server will have a SET of Models.\n", - " \"isLegacyAttribute\":False\n", - " },\n", - " endDef2={ # endDef2 decribes what the child will have as an attribute\n", - " \"type\":\"custom_ml_datastore\", # Type of the child\n", - " \"name\":\"workspace\", # What the child will have\n", - " \"isContainer\":False,\n", - " \"cardinality\":\"SINGLE\",\n", - " \"isLegacyAttribute\":False\n", - " }\n", - " )\n", - " client.upload_typedefs(relationshipDefs=[rd])\n", - " \n", - " #-----------------------------------------------------------------------------------# \n", - " rd = RelationshipTypeDef(\n", - " name=\"custom_ml_workspace_experiment\",\n", - " attributeDefs=[],\n", - " relationshipCategory=\"COMPOSITION\", # Means the child can't exist without the parent\n", - " endDef1={ # endDef1 decribes what the parent will have as an attribute\n", - " \"type\":\"custom_ml_workspace\", # Type of the parent\n", - " \"name\":\"experiments\", # What the parent will have\n", - " \"isContainer\": True,\n", - " \"cardinality\":\"SET\", # This is related to the cardinality, in this case the parent Server will have a SET of Models.\n", - " \"isLegacyAttribute\":False\n", - " },\n", - " endDef2={ # endDef2 decribes what the child will have as an attribute\n", - " \"type\":\"custom_ml_experiment\", # Type of the child\n", - " \"name\":\"workspace\", # What the child will have\n", - " \"isContainer\":False,\n", - " \"cardinality\":\"SINGLE\",\n", - " \"isLegacyAttribute\":False\n", - " }\n", - " )\n", - " client.upload_typedefs(relationshipDefs=[rd])\n", - "\n", - " #-----------------------------------------------------------------------------------# \n", - " #create types for packages and package\n", - "\n", - " #create packages type\n", - " packages_type_df = EntityTypeDef(\n", - " name=\"custom_ml_packages\",\n", - " attributeDefs=[\n", - " AtlasAttributeDef(name='notes',typename='string')\n", - " ],\n", - " superTypes = [\"DataSet\"],\n", - " options = {\"schemaElementAttribute\":\"package\"}\n", - " )\n", - "\n", - " package_type_df = EntityTypeDef(\n", - " name=\"custom_ml_package\",\n", - " attributeDefs=[\n", - " AtlasAttributeDef(name='programming_language',typename='string'),\n", - " AtlasAttributeDef(name='package_name',typename='string'),\n", - " AtlasAttributeDef(name='version',typename='string'),\n", - " AtlasAttributeDef(name='notes',typename='string')\n", - " ],\n", - " superTypes = [\"DataSet\"]\n", - " )\n", - "\n", - " # create relationsip between packages and package\n", - " package_to_packages_relationship = RelationshipTypeDef(\n", - " name=\"custom_ml_packages_to_package\",\n", - " relationshipCategory=\"COMPOSITION\",\n", - " endDef1={\n", - " \"type\": \"custom_ml_packages\",\n", - " \"name\": \"package\",\n", - " \"isContainer\": True,\n", - " \"cardinality\": \"SET\",\n", - " \"isLegacyAttribute\": False\n", - " },\n", - " endDef2={\n", - " \"type\": \"custom_ml_package\",\n", - " \"name\": \"packages\",\n", - " \"isContainer\": False,\n", - " \"cardinality\": \"SINGLE\",\n", - " \"isLegacyAttribute\": False\n", - " }\n", - " )\n", - "\n", - " typedef_results = client.upload_typedefs(\n", - " entityDefs = [packages_type_df, package_type_df],\n", - " relationshipDefs = [package_to_packages_relationship],\n", - " force_update=True\n", - " )\n", - " 
#-----------------------------------------------------------------------------------# \n", - " \n", - " #create experiemnt config type\n", - " type_df = EntityTypeDef(\n", - " name=\"custom_ml_exp_config\",\n", - " attributeDefs=[\n", - " AtlasAttributeDef(name='task_type',typename='string'),\n", - " AtlasAttributeDef(name='enable_early_stopping',typename='bool'),\n", - " AtlasAttributeDef(name='experiment_timeout_minutes',typename='int'),\n", - " AtlasAttributeDef(name='primary_metric',typename='string'),\n", - " AtlasAttributeDef(name='compute_target',typename='string'),\n", - " AtlasAttributeDef(name='label_column_name',typename='string'),\n", - " AtlasAttributeDef(name='n_cross_validations',typename='int'),\n", - " AtlasAttributeDef(name='model_explainability',typename='bool')\n", - " ],\n", - " superTypes = [\"DataSet\"]\n", - " )\n", - " typedef_results = client.upload_typedefs(entityDefs = [type_df], force_update=True)\n", - " \n", - " #-----------------------------------------------------------------------------------# \n", - "\n", - " #create model metrics type\n", - " type_df = EntityTypeDef(\n", - " name=\"custom_ml_model_metrics\",\n", - " attributeDefs=[\n", - " AtlasAttributeDef(name='AUC',typename='float'),\n", - " AtlasAttributeDef(name='Accuracy',typename='float'),\n", - " AtlasAttributeDef(name='Precision',typename='float'),\n", - " AtlasAttributeDef(name='Recall',typename='float'),\n", - " AtlasAttributeDef(name='F1',typename='float')\n", - " ],\n", - " superTypes = [\"DataSet\"]\n", - " )\n", - " typedef_results = client.upload_typedefs(entityDefs = [type_df], force_update=True)\n", - " \n", - " #-----------------------------------------------------------------------------------# \n", - "\n", - " #create model type\n", - " type_df = EntityTypeDef(\n", - " name=\"custom_ml_model\",\n", - " attributeDefs=[\n", - " AtlasAttributeDef(name='workspace_name',typename='string'),\n", - " AtlasAttributeDef(name='workspace_subscription_id',typename='string'),\n", - " AtlasAttributeDef(name='workspace_resource_group',typename='string'),\n", - " AtlasAttributeDef(name='name',typename='string'),\n", - " AtlasAttributeDef(name='id',typename='string'),\n", - " AtlasAttributeDef(name='version',typename='string'),\n", - " AtlasAttributeDef(name='tags',typename='string'),\n", - " AtlasAttributeDef(name='properties',typename='string')\n", - " ],\n", - " superTypes = [\"DataSet\"]\n", - " )\n", - " typedef_results = client.upload_typedefs(entityDefs = [type_df], force_update=True)\n", - " \n", - " #-----------------------------------------------------------------------------------# \n", - "\n", - " #create endpoint type\n", - " type_df = EntityTypeDef(\n", - " name=\"custom_ml_model_endpoint\",\n", - " attributeDefs=[\n", - " AtlasAttributeDef(name='workspace_name',typename='string'),\n", - " AtlasAttributeDef(name='workspace_subscription_id',typename='string'),\n", - " AtlasAttributeDef(name='workspace_resource_group',typename='string'),\n", - " AtlasAttributeDef(name='name',typename='string'),\n", - " AtlasAttributeDef(name='image_id',typename='string'),\n", - " AtlasAttributeDef(name='compute_type',typename='string'),\n", - " AtlasAttributeDef(name='state',typename='string'),\n", - " AtlasAttributeDef(name='scoring_uri',typename='string'),\n", - " AtlasAttributeDef(name='tags',typename='string'),\n", - " AtlasAttributeDef(name='state',typename='string'),\n", - " AtlasAttributeDef(name='properties',typename='string'),\n", - " AtlasAttributeDef(name='created_by',typename='string'),\n", - " 
AtlasAttributeDef(name='sample_json',typename='string')\n", - " ],\n", - " superTypes = [\"DataSet\"]\n", - " )\n", - " typedef_results = client.upload_typedefs(entityDefs = [type_df], force_update=True)\n", - " \n", - " #-----------------------------------------------------------------------------------# \n", - "except:\n", - " print('types already created') " - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.7.4" }, - "save_output": true, - "synapse_widget": { - "state": {}, - "version": "0.1" - } - }, - "nbformat": 4, - "nbformat_minor": 2 -} + "nbformat": 4, + "nbformat_minor": 2 +} \ No newline at end of file diff --git a/SynapseNotebooks/03_Create_ML_Lineage_Functions.ipynb b/SynapseNotebooks/03_Create_ML_Lineage_Functions.ipynb index f1ead89..efd5458 100644 --- a/SynapseNotebooks/03_Create_ML_Lineage_Functions.ipynb +++ b/SynapseNotebooks/03_Create_ML_Lineage_Functions.ipynb @@ -1,915 +1,920 @@ { - "cells": [ - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true, - "jupyter": { - "outputs_hidden": false, - "source_hidden": false + "cells": [ + { + "cell_type": "markdown", + "source": [ + "Copyright (c) Microsoft. All rights reserved.\r\n", + "\r\n", + "Licensed under the MIT license." + ], + "metadata": { + "nteract": { + "transient": { + "deleting": false + } + } + } }, - "nteract": { - "transient": { - "deleting": false - } + { + "cell_type": "code", + "source": [ + "def get_entity_details(qualifiedName,typeName):\n", + " entities = client.get_entity(\n", + " qualifiedName=[qualifiedName],\n", + " typeName=typeName\n", + " )\n", + " for entity in entities.get(\"entities\"):\n", + " entity = entity\n", + " break\n", + " return entity\n", + "#get_entity_details('https://sampledataadls.dfs.core.windows.net/masterdata/employees.csv','azure_datalake_gen2_path')\n", + "\n", + "def get_entity_guid(qualifiedName,typeName):\n", + " entities = client.get_entity(\n", + " qualifiedName=[qualifiedName],\n", + " typeName=typeName\n", + " )\n", + " for entity in entities.get(\"entities\"):\n", + " entity_guid = entity.get(\"guid\")\n", + " break\n", + " return entity_guid\n", + "#get_entity_guid('https://sampledataadls.dfs.core.windows.net/creditriskdata/borrower.csv','azure_datalake_gen2_path')\n", + "\n", + "def get_entity_schema(guid):\n", + " columns = []\n", + " results = client.get_entity(guid)\n", + " for entity in results[\"entities\"]:\n", + " if \"tabular_schema\" in entity[\"relationshipAttributes\"]:\n", + " ts = entity[\"relationshipAttributes\"][\"tabular_schema\"]\n", + " ts_entity = client.get_entity(ts[\"guid\"])\n", + " for schema in ts_entity[\"entities\"]:\n", + " for col in schema[\"relationshipAttributes\"][\"columns\"]:\n", + " if col['displayText'] != ':csv':\n", + " columns.append(col['displayText'])\n", + " return(columns)\n", + " \n", + "# ent_guid = 'a8698a33-9174-43cb-8835-26968862e2bf'\n", + "# get_entity_schema(ent_guid)\n", + "\n", + "def create_data_entity_with_schema_and_parent(df_data,entityname,entitytype='custom_ml_dataset',parent_entityname=None,parent_entitytype='custom_ml_datastore'):\n", + " # Create an asset for the output data schema.\n", + " output_schema_entity = AtlasEntity(\n", + " name=\"schema-\" 
+ entityname,\n", + " qualified_name = \"pyapacheatlas://\"+\"schema-\" + entityname,\n", + " typeName=\"tabular_schema\",\n", + " guid=guid.get_guid()\n", + " )\n", + "\n", + " df_data_schema = pd.DataFrame(list(zip(list(df_data.columns), list(df_data.dtypes))),columns=['column','dtype'])\n", + "\n", + " #Iterate over the out data frame's columns and create entities\n", + " output_entity_schema_columns = []\n", + " #for column in df.schema:\n", + " for index, row in df_data_schema.iterrows(): \n", + " temp_column = AtlasEntity(\n", + " name = row.column,\n", + " typeName = \"column\",\n", + " qualified_name = \"pyapacheatlas://schema-\" + entityname + \"#\" + row.column,\n", + " guid=guid.get_guid(),\n", + " attributes = {\"type\":str(row.dtype),\"description\": row.column},\n", + " relationshipAttributes = {\"composeSchema\":output_schema_entity.to_json(minimum=True)}\n", + " )\n", + " output_entity_schema_columns.append(temp_column)\n", + "\n", + "\n", + " if parent_entityname:\n", + " dstore_entity = get_entity_details(\"pyapacheatlas://\"+parent_entityname, parent_entitytype)\n", + " # Create a entity for dataset \n", + " dataset_output_entity = AtlasEntity(\n", + " name=entityname,\n", + " typeName=entitytype,\n", + " qualified_name=\"pyapacheatlas://\" + entityname,\n", + " guid = guid.get_guid(),\n", + " relationshipAttributes = {\n", + " \"tabular_schema\": output_schema_entity.to_json(minimum=True),\n", + " \"datastore\":dstore_entity\n", + " }\n", + " )\n", + " else:\n", + " # Create a entity for dataset \n", + " dataset_output_entity = AtlasEntity(\n", + " name=entityname,\n", + " typeName=entitytype,\n", + " qualified_name=\"pyapacheatlas://\" + entityname,\n", + " guid = guid.get_guid(),\n", + " relationshipAttributes = {\n", + " \"tabular_schema\": output_schema_entity.to_json(minimum=True)\n", + " }\n", + " )\n", + "\n", + " # Prepare all the entities as a batch to be uploaded.\n", + " batch = [dataset_output_entity, output_schema_entity] + output_entity_schema_columns\n", + " batch\n", + "\n", + " # Upload all entities!\n", + " client.upload_entities(batch=batch)\n", + " \n", + "def create_data_entity_with_schema(df_data,entityname,entitytype='custom_ml_dataset'):\n", + " # Create an asset for the output data schema.\n", + " output_schema_entity = AtlasEntity(\n", + " name=\"schema-\" + entityname,\n", + " qualified_name = \"pyapacheatlas://\"+\"schema-\" + entityname,\n", + " typeName=\"tabular_schema\",\n", + " guid=guid.get_guid()\n", + " )\n", + "\n", + " df_data_schema = pd.DataFrame(list(zip(list(df_data.columns), list(df_data.dtypes))),columns=['column','dtype'])\n", + "\n", + " #Iterate over the out data frame's columns and create entities\n", + " output_entity_schema_columns = []\n", + " #for column in df.schema:\n", + " for index, row in df_data_schema.iterrows(): \n", + " temp_column = AtlasEntity(\n", + " name = row.column,\n", + " typeName = \"column\",\n", + " qualified_name = \"pyapacheatlas://schema-\" + entityname + \"#\" + row.column,\n", + " guid=guid.get_guid(),\n", + " attributes = {\"type\":str(row.dtype),\"description\": row.column},\n", + " relationshipAttributes = {\"composeSchema\":output_schema_entity.to_json(minimum=True)}\n", + " )\n", + " output_entity_schema_columns.append(temp_column)\n", + "\n", + " # Create a entity for dataset \n", + " dataset_output_entity = AtlasEntity(\n", + " name=entityname,\n", + " typeName=entitytype,\n", + " qualified_name=\"pyapacheatlas://\" + entityname,\n", + " guid = guid.get_guid(),\n", + " 
relationshipAttributes = {\n", + " \"tabular_schema\": output_schema_entity.to_json(minimum=True)\n", + " }\n", + " )\n", + "\n", + " # Prepare all the entities as a batch to be uploaded.\n", + " batch = [dataset_output_entity, output_schema_entity] + output_entity_schema_columns\n", + " batch\n", + "\n", + " # Upload all entities!\n", + " client.upload_entities(batch=batch)\n", + " \n", + "def create_lineage_for_entities(experimentname,processname,in_ent_qns,out_ent_qns,process_type_name='Process',ColumnMapping=False):\n", + " # create a process \n", + " # inputs: list of (entity,type) tuples\n", + " # outputs: list of (entity,type) tuples\n", + "\n", + " from pyapacheatlas.core import AtlasProcess\n", + "\n", + " in_ent_guids = []\n", + " for in_ent_qn in in_ent_qns:\n", + " #print(in_ent_qn,in_ent_qns[in_ent_qn])\n", + " in_ent_guid = get_entity_guid(in_ent_qn,in_ent_qns[in_ent_qn])\n", + " in_ent_guids.append({'guid':in_ent_guid})\n", + " \n", + " out_ent_guids = []\n", + " for out_ent_qn in out_ent_qns:\n", + " #print(in_ent_qn,in_ent_qns[in_ent_qn])\n", + " out_ent_guid = get_entity_guid(out_ent_qn,out_ent_qns[out_ent_qn])\n", + " out_ent_guids.append({'guid':out_ent_guid})\n", + "\n", + " process_name = experimentname + processname\n", + " process_qn = \"pyapacheatlas://\" + process_name\n", + "\n", + " if ColumnMapping == False:\n", + " process_type_name = process_type_name\n", + "\n", + " process = AtlasProcess(\n", + " name=process_name,\n", + " typeName=process_type_name,\n", + " qualified_name=process_qn,\n", + " inputs = in_ent_guids,\n", + " outputs = out_ent_guids,\n", + " guid=guid.get_guid()\n", + " )\n", + " else:\n", + " process_type_name = \"ProcessWithColumnMapping\"\n", + "\n", + " column_mapping_attributes = []\n", + " for in_ent_qn in in_ent_qns:\n", + " cl_mapping = []\n", + " in_ent_columns = get_entity_schema(get_entity_guid(in_ent_qn,in_ent_qns[in_ent_qn]))\n", + " for in_col in in_ent_columns:\n", + " cl_mapping.append({\"Source\":in_col,\"Sink\":in_col})\n", + " #break\n", + " mapping = {\n", + " 'DatasetMapping': {'Source':in_ent_qn,'Sink':list(out_ent_qns.keys())[0]},\n", + " 'ColumnMapping': cl_mapping\n", + " }\n", + " column_mapping_attributes.append(mapping)\n", + "\n", + " process = AtlasProcess(\n", + " name=process_name,\n", + " typeName=process_type_name,\n", + " qualified_name=process_qn,\n", + " inputs = in_ent_guids,\n", + " outputs = out_ent_guids,\n", + " guid=guid.get_guid(),\n", + " attributes={\"columnMapping\":json.dumps(column_mapping_attributes)}\n", + " )\n", + "\n", + " # Prepare all the entities as a batch to be uploaded.\n", + " batch = [process]\n", + " batch\n", + "\n", + " # Upload all entities!\n", + " client.upload_entities(batch=batch)\n", + " \n", + "def create_entity(name,typeName,config_attibutes):\n", + " # Create an entity\n", + " name = name \n", + " qn = \"pyapacheatlas://\" + name\n", + "\n", + " exp_config_entity = AtlasEntity(\n", + " name=name,\n", + " typeName=typeName,\n", + " qualified_name=qn,\n", + " guid = guid.get_guid(),\n", + " attributes = config_attibutes\n", + " )\n", + "\n", + " # Upload all entities!\n", + " client.upload_entities(batch=[exp_config_entity.to_json()])\n", + "\n", + " \n", + "def get_dataset_details(indataset,experiment_name=''):\n", + " result = []\n", + " #print(indataset)\n", + " if 'FileDataset' in str(type((indataset))):\n", + " dssource = eval(json.loads(str(indataset).replace('FileDataset',''))['source'][0])\n", + " sourcestore = dssource[0]\n", + " sourcepath = dssource[1]\n", + " 
sourcepathfiles = indataset.to_path()\n", + " for sourcepathfile in sourcepathfiles:\n", + " entityname = sourcepath.split('/')[-1] + sourcepathfile.replace('/','_') #.replace('.parquet','').replace('.csv','')\n", + " #print('\\nFileDataset:',entityname)\n", + "\n", + " dsdatastore = Datastore.get(ws, sourcestore)\n", + " datastore_path = [DataPath(dsdatastore, sourcepath+sourcepathfile.replace('/',''))]\n", + " \n", + " if '.parquet' in sourcepathfile:\n", + " tabular_dataset = Dataset.Tabular.from_parquet_files(path=datastore_path)\n", + " df_data = tabular_dataset.take(10).to_pandas_dataframe()\n", + " \n", + " elif '.csv' in sourcepathfile:\n", + " tabular_dataset = Dataset.Tabular.from_delimited_files(path=datastore_path,encoding ='iso88591') \n", + " #'utf8', 'iso88591', 'latin1', 'ascii', 'utf16', 'utf32', 'utf8bom' and 'windows1252'\n", + " df_data = tabular_dataset.take(10).to_pandas_dataframe()\n", + " \n", + " if experiment_name != '':\n", + " result.append((entityname + '_' + experiment_name,df_data))\n", + " else:\n", + " result.append((entityname,df_data))\n", + "\n", + " elif 'TabularDataset' in str(type((indataset))):\n", + " tabular_dataset = indataset\n", + " entityname = json.loads(str(indataset).replace('TabularDataset',''))['registration']['name']\n", + " \n", + " # dataset = Dataset.get_by_name(ws, name=entityname)\n", + " # try:\n", + " # sourcestore = json.loads(dataset._definition)['blocks'][0]['arguments']['datastore']['datastoreName']\n", + " # except:\n", + " # sourcestore = json.loads(dataset._definition)['blocks'][0]['arguments']['datastores'][0]['datastoreName']\n", + " df_data = tabular_dataset.take(10).to_pandas_dataframe()\n", + " #print('TabularDataset:', entityname)\n", + " result.append((entityname,df_data))\n", + " return result\n", + "\n", + "\n", + "from azureml.core import Experiment\n", + "from azureml.pipeline.core import PipelineRun\n", + "\n", + "from azureml.core import Workspace, Datastore, Dataset\n", + "from azureml.data.datapath import DataPath\n", + "import json \n", + "import pandas as pd\n", + "\n", + "def create_aml_experiment_steps(ws,experiment_name):\n", + " experiments_lst = Experiment.list(ws)\n", + " for experiment in experiments_lst:\n", + " if experiment.name == experiment_name:\n", + " #print(experiment)\n", + " exp = Experiment(ws,experiment.name)\n", + " for run in exp.get_runs(): \n", + " rundetails = run.get_details()\n", + " #print(rundetails)\n", + " if rundetails['status'] != 'Completed': #continue until we find a completed run \n", + " continue\n", + " pipeline_run = PipelineRun(exp, rundetails['runId'])\n", + "\n", + " steps = pipeline_run.get_steps()\n", + " for step_run in steps:\n", + " step_run_details = step_run.get_details_with_logs()\n", + " #print(step_run_details)\n", + " #print(step_run_details['runDefinition']['script'])\n", + "\n", + " purview_basepath = 'pyapacheatlas://'\n", + " in_ent_qns = {}\n", + " out_ent_qns = {}\n", + " #print(step_run_details)\n", + " step_name = step_run.name #step_run_details['runDefinition']['script']\n", + " #print(step_name)\n", + " \n", + " #print('\\n Input Datasets:\\n')\n", + " for indataset in step_run_details['inputDatasets']:\n", + " in_result = get_dataset_details(indataset['dataset'],experiment_name)\n", + " #print(in_result)\n", + " #create entities \n", + " for in_res in in_result:\n", + " data_ent_name = in_res[0].strip('_')\n", + " create_data_entity_with_schema(in_res[1],data_ent_name,'custom_ml_dataset')\n", + " in_ent_qns[purview_basepath + data_ent_name] = 
'custom_ml_dataset'\n", + " #break\n", + " #print('\\n Output Datasets:\\n')\n", + " for outdataset in step_run_details['outputDatasets']:\n", + " out_result = get_dataset_details(outdataset['dataset'],experiment_name)\n", + " #print(out_result)\n", + " #create entities\n", + " for out_res in out_result:\n", + " data_ent_name = out_res[0].strip('_')\n", + " create_data_entity_with_schema(out_res[1],data_ent_name,'custom_ml_dataset')\n", + " out_ent_qns[purview_basepath + data_ent_name] = 'custom_ml_dataset'\n", + " #break\n", + " #print(in_ent_qns,out_ent_qns)\n", + " create_lineage_for_entities(experiment_name + '_',step_name, in_ent_qns,out_ent_qns,process_type_name='custom_ml_experiment_step',ColumnMapping=False)\n", + " #break \n", + " \n", + " break # break after processing one completed run\n", + " break #after finding the experiment\n", + "\n", + "\n", + "#create workspace entity\n", + "def create_workspace_entities(ws):\n", + "\n", + " config_attibutes={}\n", + " temp_column={}\n", + "\n", + " temp_column['name'] = ws.name\n", + " config_attibutes.update(temp_column)\n", + " temp_column['subscription_id'] = ws.subscription_id\n", + " config_attibutes.update(temp_column)\n", + " temp_column['resource_group'] = ws.resource_group\n", + " config_attibutes.update(temp_column)\n", + "\n", + " create_entity(ws.name,'custom_ml_workspace',config_attibutes)\n", + " #break\n", + "\n", + "\n", + "#create all datastore entities\n", + "def create_datastore_entities(ws):\n", + " for datastore in ws.datastores.values():\n", + " config_attibutes={}\n", + " temp_column={}\n", + " \n", + " temp_column['name'] = datastore.name\n", + " config_attibutes.update(temp_column)\n", + "\n", + " if ('AzureDataLakeGen2Datastore' in str(type(datastore))) or ('AzureBlobDatastore' in str(type(datastore))):\n", + " temp_column['container_name'] = datastore.container_name\n", + " config_attibutes.update(temp_column)\n", + " temp_column['account_name'] = datastore.account_name\n", + " config_attibutes.update(temp_column)\n", + " temp_column['protocol'] = datastore.protocol\n", + " config_attibutes.update(temp_column)\n", + " temp_column['endpoint'] = datastore.endpoint\n", + " config_attibutes.update(temp_column)\n", + " elif 'AzureSqlDatabaseDatastore' in str(type(datastore)):\n", + " #print('sql',datastore.server_name)\n", + " temp_column['server_name'] = datastore.server_name\n", + " config_attibutes.update(temp_column)\n", + " temp_column['database_name'] = datastore.database_name\n", + " config_attibutes.update(temp_column)\n", + " elif 'AzureBlobDatastore' in str(type(datastore)): \n", + " pass\n", + "\n", + " create_entity(datastore.name,'custom_ml_datastore',config_attibutes)\n", + " #break\n", + "\n", + " #create workspace and datastore relationship\n", + " purview_basepath = 'pyapacheatlas://'\n", + " for datastore in ws.datastores.values():\n", + " relationshiptype = 'custom_ml_workspace_datastore'\n", + " end1type = 'custom_ml_workspace'\n", + " end2type = 'custom_ml_datastore'\n", + " end1_qn = purview_basepath + ws.name\n", + " end2_qn = purview_basepath + datastore.name\n", + " try:\n", + " create_entities_relationship(relationshiptype,end1type,end2type,end1_qn,end2_qn)\n", + " except:\n", + " pass # ignore if relationship exists\n", + "\n", + "#create all dataset entities (with datastore as parent)\n", + "from azureml.core import Workspace, Datastore, Dataset\n", + "import pandas as pd\n", + "def create_dataset_entities(ws,parent_flag=True):\n", + " purview_basepath = 'pyapacheatlas://'\n", + " for 
dsname in ws.datasets:\n", + " dataset = ws.datasets[dsname]\n", + " try:\n", + " if 'FileDataset' in str(type((dataset))):\n", + " datasetsource = eval(json.loads(str(dataset).replace('FileDataset',''))['source'][0])[0]\n", + " elif 'TabularDataset' in str(type((dataset))):\n", + " datasetsource = eval(json.loads(str(dataset).replace('TabularDataset',''))['source'][0])[0]\n", + " dsdetails = get_dataset_details(dataset)\n", + " #print(dsdetails)\n", + " for ds in dsdetails:\n", + " if parent_flag == False:\n", + " create_data_entity_with_schema(ds[1],dsname,'custom_ml_dataset')\n", + " create_lineage_for_entities('',('register_' + dsname), {(purview_basepath+datasetsource):'custom_ml_datastore'},\n", + " {(purview_basepath+ds[0]):'custom_ml_dataset'},ColumnMapping=False)\n", + " else: \n", + " create_data_entity_with_schema_and_parent(ds[1],dsname,entitytype='custom_ml_dataset',\n", + " parent_entityname=datasetsource,parent_entitytype='custom_ml_datastore') \n", + " except:\n", + " print('Error:',dsname) \n", + " #break\n", + " \n", + " \n", + "#create experiment entity\n", + "from azureml.core import Experiment\n", + "\n", + "def create_experiment_entities(ws):\n", + " for experiment in Experiment.list(ws):\n", + " #create experiment entity\n", + " config_attibutes={}\n", + " temp_column={}\n", + "\n", + " temp_column['name'] = experiment.name\n", + " config_attibutes.update(temp_column)\n", + "\n", + " create_entity(experiment.name,'custom_ml_experiment',config_attibutes)\n", + " #break\n", + " \n", + " purview_basepath = 'pyapacheatlas://'\n", + "\n", + " #create experiment relationship to workspace\n", + " relationshiptype = 'custom_ml_workspace_experiment'\n", + " end1type = 'custom_ml_workspace'\n", + " end2type = 'custom_ml_experiment'\n", + " end1_qn = purview_basepath + ws.name\n", + " end2_qn = purview_basepath + experiment.name\n", + " try:\n", + " create_entities_relationship(relationshiptype,end1type,end2type,end1_qn,end2_qn)\n", + " except:\n", + " pass # ignore if relationship exists\n", + " \n", + " for run in experiment.get_runs(): \n", + " rundetails = run.get_details()\n", + " #print(rundetails)\n", + " if rundetails['status'] != 'Completed': #continue until we find a completed run \n", + " continue\n", + " # print(rundetails['properties']['azureml.runsource'])\n", + " #create experiment steps\n", + " if rundetails['properties']['azureml.runsource'] == 'azureml.PipelineRun':\n", + " print(experiment.name)\n", + " create_aml_experiment_steps(ws,experiment.name)\n", + "\n", + " pipeline_run = PipelineRun(experiment, rundetails['runId'])\n", + "\n", + " steps = pipeline_run.get_steps()\n", + " for step_run in steps:\n", + " #print(experiment.name + '_' + step_run.name)\n", + " \n", + " #create experiment relationship to workspace\n", + " relationshiptype = 'custom_ml_experiment_to_experimentstep'\n", + " end1type = 'custom_ml_experiment'\n", + " end2type = 'custom_ml_experiment_step'\n", + " end1_qn = purview_basepath + experiment.name\n", + " end2_qn = purview_basepath + experiment.name + '_' + step_run.name\n", + " try:\n", + " create_entities_relationship(relationshiptype,end1type,end2type,end1_qn,end2_qn)\n", + " except:\n", + " pass # ignore if relationship exists\n", + "\n", + " break # break after processing one completed run\n", + " #break\n", + "\n", + "def create_entities_relationship(relationshiptype,end1type,end2type,end1_qn,end2_qn):\n", + " relationship = {}\n", + " end1 = {}\n", + " end2 = {}\n", + "\n", + " end1[\"guid\"] = 
get_entity_guid(end1_qn,end1type)\n", + " end1[\"typeName\"] = end1type\n", + " end1[\"uniqueAttributes\"] = {\"qualifiedName\": end1_qn}\n", + "\n", + " end2[\"guid\"] = get_entity_guid(end2_qn,end2type)\n", + " end2[\"typeName\"] = end2type\n", + " end2[\"uniqueAttributes\"] = {\"qualifiedName\": end2_qn}\n", + "\n", + " relationship[\"typeName\"] = relationshiptype\n", + " relationship[\"attributes\"] = {}\n", + " relationship[\"guid\"] = guid.get_guid()\n", + " relationship[\"provenanceType\"] = 0\n", + " relationship[\"end1\"] = end1\n", + " relationship[\"end2\"] = end2\n", + " relationship\n", + " \n", + " client.upload_relationship(relationship) \n", + " \n", + "def create_package_entities(experimentname,packageslist):\n", + " packages_name = experimentname + '-packages' \n", + " packages_qn = \"pyapacheatlas://\" + packages_name\n", + "\n", + " # Create an asset for the packages.\n", + " packages_entity = AtlasEntity(\n", + " name = packages_name,\n", + " qualified_name = packages_qn,\n", + " typeName=\"custom_ml_packages\",\n", + " attributes = {\"notes\":\"test note\"},\n", + " guid=guid.get_guid()\n", + " )\n", + "\n", + " packages_entity.to_json(minimum=True)\n", + "\n", + " atlas_packages = []\n", + " relationships = []\n", + " for package in packageslist:\n", + " package_attibutes={}\n", + " temp_column={}\n", + " temp_column['programming_language'] = str(package[0])\n", + " package_attibutes.update(temp_column)\n", + " temp_column['package_name'] = str(package[1])\n", + " package_attibutes.update(temp_column)\n", + " temp_column['version'] = str(package[2])\n", + " package_attibutes.update(temp_column)\n", + " temp_column['notes'] = str(package[3])\n", + " package_attibutes.update(temp_column)\n", + "\n", + " # Create an entity for each package\n", + " name = str(package[1]) #experimentname + '-package-' + package[1] \n", + " qn = packages_qn + '#' + str(package[1]) #\"pyapacheatlas://\" + name\n", + "\n", + " package_entity = AtlasEntity(\n", + " name= name,\n", + " typeName=\"custom_ml_package\",\n", + " qualified_name=qn,\n", + " guid = guid.get_guid(),\n", + " attributes = package_attibutes,\n", + " relationshipAttributes = {\"packages\":packages_entity.to_json(minimum=True)}\n", + " )\n", + " atlas_packages.append(package_entity)\n", + "\n", + " atlas_packages\n", + "\n", + " # Prepare all the entities as a batch to be uploaded.\n", + " batch = [packages_entity] + atlas_packages\n", + " client.upload_entities(batch=batch) \n", + " \n", + "def create_experiment_config_entity(ws,experiment_name,automl_run):\n", + " # Get experiment config from AML run\n", + " import json\n", + " import pandas as pd\n", + " run_properties = automl_run.get_properties()\n", + " run_properties\n", + "\n", + " AMLSettingsJsonString = run_properties['AMLSettingsJsonString']\n", + " AMLSettings = json.loads(AMLSettingsJsonString)\n", + "\n", + " df_config = pd.DataFrame(list(AMLSettings.items()),columns = ['key','value']) \n", + "\n", + " keys = ['task_type','enable_early_stopping','experiment_timeout_minutes','primary_metric','compute_target','label_column_name','n_cross_validations','model_explainability']\n", + "\n", + " df_config = df_config[df_config['key'].isin(keys)]\n", + "\n", + " dict_config = df_config.to_dict(orient = 'records')\n", + " dict_config\n", + "\n", + " config_attibutes={}\n", + " for attibutes in dict_config:\n", + " temp_column={}\n", + " temp_column[attibutes['key']] = attibutes['value']\n", + " config_attibutes.update(temp_column)\n", + " config_attibutes\n", + "\n", 
+ " # Create a entity for exp config \n", + " name = experiment_name + \"-config\"\n", + " qn = \"pyapacheatlas://\" + name\n", + "\n", + " exp_config_entity = AtlasEntity(\n", + " name=name,\n", + " typeName=\"custom_ml_exp_config\",\n", + " qualified_name=qn,\n", + " guid = guid.get_guid(),\n", + " attributes = config_attibutes\n", + " )\n", + "\n", + " # Upload all entities!\n", + " client.upload_entities(batch=[exp_config_entity.to_json()])\n", + " \n", + "def create_model_entity(ws,experiment_name,modelname):\n", + " # get deployed model\n", + " from azureml.core.model import Model\n", + " model = Model(ws, modelname)\n", + "\n", + " config_attibutes={}\n", + " temp_column={}\n", + " temp_column['workspace_name'] = model.workspace.name\n", + " config_attibutes.update(temp_column)\n", + " temp_column['workspace_subscription_id'] = model.workspace.subscription_id\n", + " config_attibutes.update(temp_column)\n", + " temp_column['workspace_subscription_id'] = model.workspace.subscription_id\n", + " config_attibutes.update(temp_column)\n", + " temp_column['workspace_resource_group'] = model.workspace.resource_group\n", + " config_attibutes.update(temp_column)\n", + " temp_column['name'] = model.name\n", + " config_attibutes.update(temp_column)\n", + " temp_column['id'] = model.id\n", + " config_attibutes.update(temp_column)\n", + " temp_column['version'] = model.version\n", + " config_attibutes.update(temp_column)\n", + " temp_column['tags'] = model.tags\n", + " config_attibutes.update(temp_column)\n", + " temp_column['properties'] = model.properties\n", + " config_attibutes.update(temp_column)\n", + "\n", + " # Create a entity for Model\n", + " name = modelname \n", + " qn = \"pyapacheatlas://\" + name\n", + "\n", + " exp_config_entity = AtlasEntity(\n", + " name=name,\n", + " typeName=\"custom_ml_model\",\n", + " qualified_name=qn,\n", + " guid = guid.get_guid(),\n", + " attributes = config_attibutes\n", + " )\n", + "\n", + " # Upload all entities!\n", + " client.upload_entities(batch=[exp_config_entity.to_json()]) \n", + " \n", + "def create_model_metrics_entity(experiment_name,best_run):\n", + " metrics = best_run.get_metrics()\n", + "\n", + " # select relevant metrics\n", + " auc = metrics.get('AUC_weighted')\n", + " accuracy = metrics.get('accuracy')\n", + " precision = metrics.get('precision_score_weighted')\n", + " recall = metrics.get('recall_score_weighted')\n", + " f1 = metrics.get('f1_score_weighted')\n", + "\n", + " # # combine into single dataframe\n", + " # metrics_df = sc.parallelize([['AUC', auc], ['Accuracy', accuracy], ['Precision', precision], ['Recall', recall], ['F1', f1]]).toDF(('Metric', 'Value'))\n", + " metrics = ['AUC','Accuracy','Precision','Recall','F1']\n", + " metricslist= [auc,accuracy,precision,recall,f1]\n", + " columns = ['Metric','Value']\n", + " metrics_df = pd.DataFrame(zip(metrics, metricslist),columns=columns)\n", + "\n", + "\n", + " dict_metrics = metrics_df.to_dict(orient = 'records')\n", + " dict_metrics\n", + "\n", + " config_attibutes={}\n", + " for attibutes in dict_metrics:\n", + " temp_column={}\n", + " temp_column[attibutes['Metric']] = attibutes['Value']\n", + " config_attibutes.update(temp_column)\n", + " config_attibutes\n", + "\n", + " name = experiment_name + \"-modelmetrics\"\n", + " qn = \"pyapacheatlas://\" + name\n", + "\n", + " # Create a entity for model metrics\n", + " exp_config_entity = AtlasEntity(\n", + " name=name,\n", + " typeName=\"custom_ml_model_metrics\",\n", + " qualified_name=qn,\n", + " guid = guid.get_guid(),\n", + 
" attributes = config_attibutes\n", + " )\n", + "\n", + " # Upload all entities!\n", + " client.upload_entities(batch=[exp_config_entity.to_json()])\n", + " \n", + "def create_experiment_lineage(experimentname,exp_data_qn,exp_config_qn,model_metrics_qn,model_qn): \n", + " # create experiment process \n", + " # inputs: prepareddata, modelconfig \n", + " # outputs: model metrics and registered model\n", + "\n", + " from pyapacheatlas.core import AtlasProcess\n", + "\n", + " in_data_ent_guid = get_entity_guid(exp_data_qn,'custom_dataset')\n", + " in_exp_config_guid = get_entity_guid(exp_config_qn,'custom_ml_exp_config')\n", + " out_model_metrics_guid = get_entity_guid(model_metrics_qn,'custom_ml_model_metrics')\n", + " out_model_guid = get_entity_guid(model_qn,'custom_ml_model')\n", + "\n", + " process_name = experimentname + '-train'\n", + " process_qn = \"pyapacheatlas://\" + process_name\n", + " process_type_name = \"Process\"\n", + "\n", + " process = AtlasProcess(\n", + " name=process_name,\n", + " typeName=process_type_name,\n", + " qualified_name=process_qn,\n", + " inputs = [{\"guid\":in_data_ent_guid},{\"guid\":in_exp_config_guid}],\n", + " outputs = [{\"guid\":out_model_metrics_guid},{\"guid\":out_model_guid}],\n", + " guid=guid.get_guid()\n", + " )\n", + "\n", + " # Prepare all the entities as a batch to be uploaded.\n", + " batch = [process]\n", + " batch\n", + "\n", + " # Upload all entities!\n", + " client.upload_entities(batch=batch) \n", + " \n", + "def create_model_service_entity(ws,experimentname,aci_service_name,samplejson):\n", + " # get deployed ACI Web Service\n", + " from azureml.core.webservice import AciWebservice\n", + " aciws = AciWebservice(ws, aci_service_name)\n", + "\n", + " config_attibutes={}\n", + " temp_column={}\n", + " temp_column['workspace_name'] = aciws.workspace.name\n", + " config_attibutes.update(temp_column)\n", + " temp_column['workspace_subscription_id'] = aciws.workspace.subscription_id\n", + " config_attibutes.update(temp_column)\n", + " temp_column['workspace_resource_group'] = aciws.workspace.resource_group\n", + " config_attibutes.update(temp_column)\n", + " temp_column['name'] = aciws.name\n", + " config_attibutes.update(temp_column)\n", + " temp_column['image_id'] = aciws.image_id\n", + " config_attibutes.update(temp_column)\n", + " temp_column['compute_type'] = aciws.compute_type\n", + " config_attibutes.update(temp_column)\n", + " temp_column['state'] = aciws.state\n", + " config_attibutes.update(temp_column)\n", + " temp_column['scoring_uri'] = aciws.scoring_uri\n", + " config_attibutes.update(temp_column)\n", + " temp_column['tags'] = aciws.tags\n", + " config_attibutes.update(temp_column)\n", + " temp_column['state'] = aciws.state\n", + " config_attibutes.update(temp_column)\n", + " temp_column['properties'] = aciws.properties\n", + " config_attibutes.update(temp_column)\n", + " temp_column['created_by'] = aciws.created_by\n", + " config_attibutes.update(temp_column)\n", + " temp_column['sample_json'] = samplejson\n", + " config_attibutes.update(temp_column)\n", + "\n", + " name = experimentname + \"-model_endpoint\"\n", + " qn = \"pyapacheatlas://\" + name\n", + "\n", + " # Create a entity for ACI Web Service\n", + " endpoint_entity = AtlasEntity(\n", + " name=name,\n", + " typeName=\"custom_ml_model_endpoint\",\n", + " qualified_name=qn,\n", + " guid = guid.get_guid(),\n", + " attributes = config_attibutes\n", + " )\n", + "\n", + " # Upload all entities!\n", + " client.upload_entities(batch=[endpoint_entity.to_json()]) \n", + " \n", 
+ "def create_powerbi_dataset_and_lineage(experiment_name,pbi_workspace,pbi_datasetid,pbidata_ent_name,ml_dataset_ent_name,ml_dataset_ent_type):\n", + " \n", + " pbidata_entity_type = 'powerbi_dataset'\n", + " pbidata_ent_qn = pbi_workspace + '/datasets/' + pbi_datasetid \n", + " purview_basepath = 'pyapacheatlas://'\n", + " #\"https://msit.powerbi.com/groups/7d666287-f9b8-45ff-be6c-9909afe9df40/datasets/e5a30c22-466d-4a30-a1ac-8736ed6567cc\"\n", + "\n", + " pbidata_ent = AtlasEntity(\n", + " name=pbidata_ent_name,\n", + " typeName=pbidata_entity_type,\n", + " qualified_name= pbidata_ent_qn,\n", + " workspace = pbi_workspace,\n", + " guid = guid.get_guid()\n", + " )\n", + "\n", + " # Prepare all the entities as a batch to be uploaded.\n", + " batch = [pbidata_ent]\n", + " batch\n", + "\n", + " # Upload all entities!\n", + " client.upload_entities(batch=batch)\n", + "\n", + " #cretae powerbi_dataset_process lineage\n", + " in_ent_guids = []\n", + " in_ent_guid = get_entity_guid(purview_basepath + ml_dataset_ent_name,ml_dataset_ent_type)\n", + " in_ent_guids.append({'guid':in_ent_guid})\n", + "\n", + " out_ent_guids = []\n", + " out_ent_guid = get_entity_guid(pbidata_ent_qn,pbidata_entity_type)\n", + " out_ent_guids.append({'guid':out_ent_guid})\n", + "\n", + " process_name = 'createpowerbidataset' + pbidata_ent_name + experiment_name\n", + " process_qn = \"pyapacheatlas://\" + process_name\n", + " process_type_name = \"powerbi_dataset_process\"\n", + "\n", + " process = AtlasProcess(\n", + " name=process_name,\n", + " typeName=process_type_name,\n", + " qualified_name=process_qn,\n", + " inputs = in_ent_guids,\n", + " outputs = out_ent_guids,\n", + " guid=guid.get_guid()\n", + " )\n", + "\n", + " # Prepare all the entities as a batch to be uploaded.\n", + " batch = [process]\n", + " batch\n", + "\n", + " # Upload all entities!\n", + " client.upload_entities(batch=batch)\n", + " \n", + "def create_powerbi_report_and_lineage(experiment_name,pbi_workspace,pbi_reportid,pbi_ent_name,pbi_datasetid):\n", + "\n", + " #create powerbi report\n", + " pbi_entity_type = 'powerbi_report'\n", + " pbi_ent_qn = pbi_workspace + '/reports/' + pbi_reportid \n", + " purview_basepath = 'pyapacheatlas://'\n", + " \n", + " pbi_ent = AtlasEntity(\n", + " name=pbi_ent_name,\n", + " typeName=pbi_entity_type,\n", + " qualified_name= pbi_ent_qn, \n", + " workspace = pbi_workspace,\n", + " guid = guid.get_guid()\n", + " )\n", + "\n", + " # Prepare all the entities as a batch to be uploaded.\n", + " batch = [pbi_ent]\n", + " batch\n", + "\n", + " # Upload all entities!\n", + " client.upload_entities(batch=batch)\n", + "\n", + " #create powerbi dashboard process lineage\n", + " pbidata_ent_qn = pbi_workspace + '/datasets/' + pbi_datasetid \n", + " in_ent_guids = []\n", + " in_ent_guid = get_entity_guid(pbidata_ent_qn,'powerbi_dataset')\n", + " in_ent_guids.append({'guid':in_ent_guid})\n", + "\n", + " out_ent_guids = []\n", + " out_ent_guid = get_entity_guid(pbi_ent_qn,'powerbi_report')\n", + " out_ent_guids.append({'guid':out_ent_guid})\n", + "\n", + " process_name = 'createpowerbireport' + pbi_ent_name + experiment_name\n", + " process_qn = \"pyapacheatlas://\" + process_name\n", + " process_type_name = \"powerbi_report_process\"\n", + "\n", + " process = AtlasProcess(\n", + " name=process_name,\n", + " typeName=process_type_name,\n", + " qualified_name=process_qn,\n", + " inputs = in_ent_guids,\n", + " outputs = out_ent_guids,\n", + " guid=guid.get_guid()\n", + " )\n", + "\n", + " # Prepare all the entities as a batch 
to be uploaded.\n", + " batch = [process]\n", + " batch\n", + "\n", + " # Upload all entities!\n", + " client.upload_entities(batch=batch)\n", + " \n", + "# clean up datasets\n", + "def cleanup_entities(typename, entitytype):\n", + " filter_setup = {\"typeName\": typename, \"includeSubTypes\": True}\n", + " search = client.search_entities(\"*\", search_filter=filter_setup)\n", + " for entity in search:\n", + " #print(entity)\n", + " if entity.get(\"entityType\") == entitytype:\n", + " print(entity.get(\"id\"),entity.get(\"qualifiedName\"),entity.get(\"entityType\"))\n", + " guid = entity.get(\"id\")\n", + " client.delete_entity(guid=guid)\n", + "\n" + ], + "outputs": [], + "execution_count": null, + "metadata": { + "jupyter": { + "outputs_hidden": false, + "source_hidden": false + }, + "nteract": { + "transient": { + "deleting": false + } + }, + "collapsed": true + } + } + ], + "metadata": { + "kernelspec": { + "name": "synapse_pyspark", + "display_name": "python" + }, + "language_info": { + "name": "python" + }, + "save_output": true, + "synapse_widget": { + "version": "0.1", + "state": {} } - }, - "outputs": [], - "source": [ - "def get_entity_details(qualifiedName,typeName):\n", - " entities = client.get_entity(\n", - " qualifiedName=[qualifiedName],\n", - " typeName=typeName\n", - " )\n", - " for entity in entities.get(\"entities\"):\n", - " entity = entity\n", - " break\n", - " return entity\n", - "#get_entity_details('https://sampledataadls.dfs.core.windows.net/masterdata/employees.csv','azure_datalake_gen2_path')\n", - "\n", - "def get_entity_guid(qualifiedName,typeName):\n", - " entities = client.get_entity(\n", - " qualifiedName=[qualifiedName],\n", - " typeName=typeName\n", - " )\n", - " for entity in entities.get(\"entities\"):\n", - " entity_guid = entity.get(\"guid\")\n", - " break\n", - " return entity_guid\n", - "#get_entity_guid('https://sampledataadls.dfs.core.windows.net/creditriskdata/borrower.csv','azure_datalake_gen2_path')\n", - "\n", - "def get_entity_schema(guid):\n", - " columns = []\n", - " results = client.get_entity(guid)\n", - " for entity in results[\"entities\"]:\n", - " if \"tabular_schema\" in entity[\"relationshipAttributes\"]:\n", - " ts = entity[\"relationshipAttributes\"][\"tabular_schema\"]\n", - " ts_entity = client.get_entity(ts[\"guid\"])\n", - " for schema in ts_entity[\"entities\"]:\n", - " for col in schema[\"relationshipAttributes\"][\"columns\"]:\n", - " if col['displayText'] != ':csv':\n", - " columns.append(col['displayText'])\n", - " return(columns)\n", - " \n", - "# ent_guid = 'a8698a33-9174-43cb-8835-26968862e2bf'\n", - "# get_entity_schema(ent_guid)\n", - "\n", - "def create_data_entity_with_schema_and_parent(df_data,entityname,entitytype='custom_ml_dataset',parent_entityname=None,parent_entitytype='custom_ml_datastore'):\n", - " # Create an asset for the output data schema.\n", - " output_schema_entity = AtlasEntity(\n", - " name=\"schema-\" + entityname,\n", - " qualified_name = \"pyapacheatlas://\"+\"schema-\" + entityname,\n", - " typeName=\"tabular_schema\",\n", - " guid=guid.get_guid()\n", - " )\n", - "\n", - " df_data_schema = pd.DataFrame(list(zip(list(df_data.columns), list(df_data.dtypes))),columns=['column','dtype'])\n", - "\n", - " #Iterate over the out data frame's columns and create entities\n", - " output_entity_schema_columns = []\n", - " #for column in df.schema:\n", - " for index, row in df_data_schema.iterrows(): \n", - " temp_column = AtlasEntity(\n", - " name = row.column,\n", - " typeName = \"column\",\n", - " 
qualified_name = \"pyapacheatlas://schema-\" + entityname + \"#\" + row.column,\n", - " guid=guid.get_guid(),\n", - " attributes = {\"type\":str(row.dtype),\"description\": row.column},\n", - " relationshipAttributes = {\"composeSchema\":output_schema_entity.to_json(minimum=True)}\n", - " )\n", - " output_entity_schema_columns.append(temp_column)\n", - "\n", - "\n", - " if parent_entityname:\n", - " dstore_entity = get_entity_details(\"pyapacheatlas://\"+parent_entityname, parent_entitytype)\n", - " # Create a entity for dataset \n", - " dataset_output_entity = AtlasEntity(\n", - " name=entityname,\n", - " typeName=entitytype,\n", - " qualified_name=\"pyapacheatlas://\" + entityname,\n", - " guid = guid.get_guid(),\n", - " relationshipAttributes = {\n", - " \"tabular_schema\": output_schema_entity.to_json(minimum=True),\n", - " \"datastore\":dstore_entity\n", - " }\n", - " )\n", - " else:\n", - " # Create a entity for dataset \n", - " dataset_output_entity = AtlasEntity(\n", - " name=entityname,\n", - " typeName=entitytype,\n", - " qualified_name=\"pyapacheatlas://\" + entityname,\n", - " guid = guid.get_guid(),\n", - " relationshipAttributes = {\n", - " \"tabular_schema\": output_schema_entity.to_json(minimum=True)\n", - " }\n", - " )\n", - "\n", - " # Prepare all the entities as a batch to be uploaded.\n", - " batch = [dataset_output_entity, output_schema_entity] + output_entity_schema_columns\n", - " batch\n", - "\n", - " # Upload all entities!\n", - " client.upload_entities(batch=batch)\n", - " \n", - "def create_data_entity_with_schema(df_data,entityname,entitytype='custom_ml_dataset'):\n", - " # Create an asset for the output data schema.\n", - " output_schema_entity = AtlasEntity(\n", - " name=\"schema-\" + entityname,\n", - " qualified_name = \"pyapacheatlas://\"+\"schema-\" + entityname,\n", - " typeName=\"tabular_schema\",\n", - " guid=guid.get_guid()\n", - " )\n", - "\n", - " df_data_schema = pd.DataFrame(list(zip(list(df_data.columns), list(df_data.dtypes))),columns=['column','dtype'])\n", - "\n", - " #Iterate over the out data frame's columns and create entities\n", - " output_entity_schema_columns = []\n", - " #for column in df.schema:\n", - " for index, row in df_data_schema.iterrows(): \n", - " temp_column = AtlasEntity(\n", - " name = row.column,\n", - " typeName = \"column\",\n", - " qualified_name = \"pyapacheatlas://schema-\" + entityname + \"#\" + row.column,\n", - " guid=guid.get_guid(),\n", - " attributes = {\"type\":str(row.dtype),\"description\": row.column},\n", - " relationshipAttributes = {\"composeSchema\":output_schema_entity.to_json(minimum=True)}\n", - " )\n", - " output_entity_schema_columns.append(temp_column)\n", - "\n", - " # Create a entity for dataset \n", - " dataset_output_entity = AtlasEntity(\n", - " name=entityname,\n", - " typeName=entitytype,\n", - " qualified_name=\"pyapacheatlas://\" + entityname,\n", - " guid = guid.get_guid(),\n", - " relationshipAttributes = {\n", - " \"tabular_schema\": output_schema_entity.to_json(minimum=True)\n", - " }\n", - " )\n", - "\n", - " # Prepare all the entities as a batch to be uploaded.\n", - " batch = [dataset_output_entity, output_schema_entity] + output_entity_schema_columns\n", - " batch\n", - "\n", - " # Upload all entities!\n", - " client.upload_entities(batch=batch)\n", - " \n", - "def create_lineage_for_entities(experimentname,processname,in_ent_qns,out_ent_qns,process_type_name='Process',ColumnMapping=False):\n", - " # create a process \n", - " # inputs: list of (entity,type) tuples\n", - " # 
outputs: list of (entity,type) tuples\n", - "\n", - " from pyapacheatlas.core import AtlasProcess\n", - "\n", - " in_ent_guids = []\n", - " for in_ent_qn in in_ent_qns:\n", - " #print(in_ent_qn,in_ent_qns[in_ent_qn])\n", - " in_ent_guid = get_entity_guid(in_ent_qn,in_ent_qns[in_ent_qn])\n", - " in_ent_guids.append({'guid':in_ent_guid})\n", - " \n", - " out_ent_guids = []\n", - " for out_ent_qn in out_ent_qns:\n", - " #print(in_ent_qn,in_ent_qns[in_ent_qn])\n", - " out_ent_guid = get_entity_guid(out_ent_qn,out_ent_qns[out_ent_qn])\n", - " out_ent_guids.append({'guid':out_ent_guid})\n", - "\n", - " process_name = experimentname + processname\n", - " process_qn = \"pyapacheatlas://\" + process_name\n", - "\n", - " if ColumnMapping == False:\n", - " process_type_name = process_type_name\n", - "\n", - " process = AtlasProcess(\n", - " name=process_name,\n", - " typeName=process_type_name,\n", - " qualified_name=process_qn,\n", - " inputs = in_ent_guids,\n", - " outputs = out_ent_guids,\n", - " guid=guid.get_guid()\n", - " )\n", - " else:\n", - " process_type_name = \"ProcessWithColumnMapping\"\n", - "\n", - " column_mapping_attributes = []\n", - " for in_ent_qn in in_ent_qns:\n", - " cl_mapping = []\n", - " in_ent_columns = get_entity_schema(get_entity_guid(in_ent_qn,in_ent_qns[in_ent_qn]))\n", - " for in_col in in_ent_columns:\n", - " cl_mapping.append({\"Source\":in_col,\"Sink\":in_col})\n", - " #break\n", - " mapping = {\n", - " 'DatasetMapping': {'Source':in_ent_qn,'Sink':list(out_ent_qns.keys())[0]},\n", - " 'ColumnMapping': cl_mapping\n", - " }\n", - " column_mapping_attributes.append(mapping)\n", - "\n", - " process = AtlasProcess(\n", - " name=process_name,\n", - " typeName=process_type_name,\n", - " qualified_name=process_qn,\n", - " inputs = in_ent_guids,\n", - " outputs = out_ent_guids,\n", - " guid=guid.get_guid(),\n", - " attributes={\"columnMapping\":json.dumps(column_mapping_attributes)}\n", - " )\n", - "\n", - " # Prepare all the entities as a batch to be uploaded.\n", - " batch = [process]\n", - " batch\n", - "\n", - " # Upload all entities!\n", - " client.upload_entities(batch=batch)\n", - " \n", - "def create_entity(name,typeName,config_attibutes):\n", - " # Create an entity\n", - " name = name \n", - " qn = \"pyapacheatlas://\" + name\n", - "\n", - " exp_config_entity = AtlasEntity(\n", - " name=name,\n", - " typeName=typeName,\n", - " qualified_name=qn,\n", - " guid = guid.get_guid(),\n", - " attributes = config_attibutes\n", - " )\n", - "\n", - " # Upload all entities!\n", - " client.upload_entities(batch=[exp_config_entity.to_json()])\n", - "\n", - " \n", - "def get_dataset_details(indataset,experiment_name=''):\n", - " result = []\n", - " #print(indataset)\n", - " if 'FileDataset' in str(type((indataset))):\n", - " dssource = eval(json.loads(str(indataset).replace('FileDataset',''))['source'][0])\n", - " sourcestore = dssource[0]\n", - " sourcepath = dssource[1]\n", - " sourcepathfiles = indataset.to_path()\n", - " for sourcepathfile in sourcepathfiles:\n", - " entityname = sourcepath.split('/')[-1] + sourcepathfile.replace('/','_') #.replace('.parquet','').replace('.csv','')\n", - " #print('\\nFileDataset:',entityname)\n", - "\n", - " dsdatastore = Datastore.get(ws, sourcestore)\n", - " datastore_path = [DataPath(dsdatastore, sourcepath+sourcepathfile.replace('/',''))]\n", - " \n", - " if '.parquet' in sourcepathfile:\n", - " tabular_dataset = Dataset.Tabular.from_parquet_files(path=datastore_path)\n", - " df_data = tabular_dataset.take(10).to_pandas_dataframe()\n", - " 
\n", - " elif '.csv' in sourcepathfile:\n", - " tabular_dataset = Dataset.Tabular.from_delimited_files(path=datastore_path,encoding ='iso88591') \n", - " #'utf8', 'iso88591', 'latin1', 'ascii', 'utf16', 'utf32', 'utf8bom' and 'windows1252'\n", - " df_data = tabular_dataset.take(10).to_pandas_dataframe()\n", - " \n", - " if experiment_name != '':\n", - " result.append((entityname + '_' + experiment_name,df_data))\n", - " else:\n", - " result.append((entityname,df_data))\n", - "\n", - " elif 'TabularDataset' in str(type((indataset))):\n", - " tabular_dataset = indataset\n", - " entityname = json.loads(str(indataset).replace('TabularDataset',''))['registration']['name']\n", - " \n", - " # dataset = Dataset.get_by_name(ws, name=entityname)\n", - " # try:\n", - " # sourcestore = json.loads(dataset._definition)['blocks'][0]['arguments']['datastore']['datastoreName']\n", - " # except:\n", - " # sourcestore = json.loads(dataset._definition)['blocks'][0]['arguments']['datastores'][0]['datastoreName']\n", - " df_data = tabular_dataset.take(10).to_pandas_dataframe()\n", - " #print('TabularDataset:', entityname)\n", - " result.append((entityname,df_data))\n", - " return result\n", - "\n", - "\n", - "from azureml.core import Experiment\n", - "from azureml.pipeline.core import PipelineRun\n", - "\n", - "from azureml.core import Workspace, Datastore, Dataset\n", - "from azureml.data.datapath import DataPath\n", - "import json \n", - "import pandas as pd\n", - "\n", - "def create_aml_experiment_steps(ws,experiment_name):\n", - " experiments_lst = Experiment.list(ws)\n", - " for experiment in experiments_lst:\n", - " if experiment.name == experiment_name:\n", - " #print(experiment)\n", - " exp = Experiment(ws,experiment.name)\n", - " for run in exp.get_runs(): \n", - " rundetails = run.get_details()\n", - " #print(rundetails)\n", - " if rundetails['status'] != 'Completed': #continue until we find a completed run \n", - " continue\n", - " pipeline_run = PipelineRun(exp, rundetails['runId'])\n", - "\n", - " steps = pipeline_run.get_steps()\n", - " for step_run in steps:\n", - " step_run_details = step_run.get_details_with_logs()\n", - " #print(step_run_details)\n", - " #print(step_run_details['runDefinition']['script'])\n", - "\n", - " purview_basepath = 'pyapacheatlas://'\n", - " in_ent_qns = {}\n", - " out_ent_qns = {}\n", - " #print(step_run_details)\n", - " step_name = step_run.name #step_run_details['runDefinition']['script']\n", - " #print(step_name)\n", - " \n", - " #print('\\n Input Datasets:\\n')\n", - " for indataset in step_run_details['inputDatasets']:\n", - " in_result = get_dataset_details(indataset['dataset'],experiment_name)\n", - " #print(in_result)\n", - " #create entities \n", - " for in_res in in_result:\n", - " data_ent_name = in_res[0].strip('_')\n", - " create_data_entity_with_schema(in_res[1],data_ent_name,'custom_ml_dataset')\n", - " in_ent_qns[purview_basepath + data_ent_name] = 'custom_ml_dataset'\n", - " #break\n", - " #print('\\n Output Datasets:\\n')\n", - " for outdataset in step_run_details['outputDatasets']:\n", - " out_result = get_dataset_details(outdataset['dataset'],experiment_name)\n", - " #print(out_result)\n", - " #create entities\n", - " for out_res in out_result:\n", - " data_ent_name = out_res[0].strip('_')\n", - " create_data_entity_with_schema(out_res[1],data_ent_name,'custom_ml_dataset')\n", - " out_ent_qns[purview_basepath + data_ent_name] = 'custom_ml_dataset'\n", - " #break\n", - " #print(in_ent_qns,out_ent_qns)\n", - " 
create_lineage_for_entities(experiment_name + '_',step_name, in_ent_qns,out_ent_qns,process_type_name='custom_ml_experiment_step',ColumnMapping=False)\n", - " #break \n", - " \n", - " break # break after processing one completed run\n", - " break #after finding the experiment\n", - "\n", - "\n", - "#create workspace entity\n", - "def create_workspace_entities(ws):\n", - "\n", - " config_attibutes={}\n", - " temp_column={}\n", - "\n", - " temp_column['name'] = ws.name\n", - " config_attibutes.update(temp_column)\n", - " temp_column['subscription_id'] = ws.subscription_id\n", - " config_attibutes.update(temp_column)\n", - " temp_column['resource_group'] = ws.resource_group\n", - " config_attibutes.update(temp_column)\n", - "\n", - " create_entity(ws.name,'custom_ml_workspace',config_attibutes)\n", - " #break\n", - "\n", - "\n", - "#create all datastore entities\n", - "def create_datastore_entities(ws):\n", - " for datastore in ws.datastores.values():\n", - " config_attibutes={}\n", - " temp_column={}\n", - " \n", - " temp_column['name'] = datastore.name\n", - " config_attibutes.update(temp_column)\n", - "\n", - " if ('AzureDataLakeGen2Datastore' in str(type(datastore))) or ('AzureBlobDatastore' in str(type(datastore))):\n", - " temp_column['container_name'] = datastore.container_name\n", - " config_attibutes.update(temp_column)\n", - " temp_column['account_name'] = datastore.account_name\n", - " config_attibutes.update(temp_column)\n", - " temp_column['protocol'] = datastore.protocol\n", - " config_attibutes.update(temp_column)\n", - " temp_column['endpoint'] = datastore.endpoint\n", - " config_attibutes.update(temp_column)\n", - " elif 'AzureSqlDatabaseDatastore' in str(type(datastore)):\n", - " #print('sql',datastore.server_name)\n", - " temp_column['server_name'] = datastore.server_name\n", - " config_attibutes.update(temp_column)\n", - " temp_column['database_name'] = datastore.database_name\n", - " config_attibutes.update(temp_column)\n", - " elif 'AzureBlobDatastore' in str(type(datastore)): \n", - " pass\n", - "\n", - " create_entity(datastore.name,'custom_ml_datastore',config_attibutes)\n", - " #break\n", - "\n", - " #create workspace and datastore relationship\n", - " purview_basepath = 'pyapacheatlas://'\n", - " for datastore in ws.datastores.values():\n", - " relationshiptype = 'custom_ml_workspace_datastore'\n", - " end1type = 'custom_ml_workspace'\n", - " end2type = 'custom_ml_datastore'\n", - " end1_qn = purview_basepath + ws.name\n", - " end2_qn = purview_basepath + datastore.name\n", - " try:\n", - " create_entities_relationship(relationshiptype,end1type,end2type,end1_qn,end2_qn)\n", - " except:\n", - " pass # ignore if relationship exists\n", - "\n", - "#create all dataset entities (with datastore as parent)\n", - "from azureml.core import Workspace, Datastore, Dataset\n", - "import pandas as pd\n", - "def create_dataset_entities(ws,parent_flag=True):\n", - " purview_basepath = 'pyapacheatlas://'\n", - " for dsname in ws.datasets:\n", - " dataset = ws.datasets[dsname]\n", - " try:\n", - " if 'FileDataset' in str(type((dataset))):\n", - " datasetsource = eval(json.loads(str(dataset).replace('FileDataset',''))['source'][0])[0]\n", - " elif 'TabularDataset' in str(type((dataset))):\n", - " datasetsource = eval(json.loads(str(dataset).replace('TabularDataset',''))['source'][0])[0]\n", - " dsdetails = get_dataset_details(dataset)\n", - " #print(dsdetails)\n", - " for ds in dsdetails:\n", - " if parent_flag == False:\n", - " 
create_data_entity_with_schema(ds[1],dsname,'custom_ml_dataset')\n", - " create_lineage_for_entities('',('register_' + dsname), {(purview_basepath+datasetsource):'custom_ml_datastore'},\n", - " {(purview_basepath+ds[0]):'custom_ml_dataset'},ColumnMapping=False)\n", - " else: \n", - " create_data_entity_with_schema_and_parent(ds[1],dsname,entitytype='custom_ml_dataset',\n", - " parent_entityname=datasetsource,parent_entitytype='custom_ml_datastore') \n", - " except:\n", - " print('Error:',dsname) \n", - " #break\n", - " \n", - " \n", - "#create experiment entity\n", - "from azureml.core import Experiment\n", - "\n", - "def create_experiment_entities(ws):\n", - " for experiment in Experiment.list(ws):\n", - " #create experiment entity\n", - " config_attibutes={}\n", - " temp_column={}\n", - "\n", - " temp_column['name'] = experiment.name\n", - " config_attibutes.update(temp_column)\n", - "\n", - " create_entity(experiment.name,'custom_ml_experiment',config_attibutes)\n", - " #break\n", - " \n", - " purview_basepath = 'pyapacheatlas://'\n", - "\n", - " #create experiment relationship to workspace\n", - " relationshiptype = 'custom_ml_workspace_experiment'\n", - " end1type = 'custom_ml_workspace'\n", - " end2type = 'custom_ml_experiment'\n", - " end1_qn = purview_basepath + ws.name\n", - " end2_qn = purview_basepath + experiment.name\n", - " try:\n", - " create_entities_relationship(relationshiptype,end1type,end2type,end1_qn,end2_qn)\n", - " except:\n", - " pass # ignore if relationship exists\n", - " \n", - " for run in experiment.get_runs(): \n", - " rundetails = run.get_details()\n", - " #print(rundetails)\n", - " if rundetails['status'] != 'Completed': #continue until we find a completed run \n", - " continue\n", - " # print(rundetails['properties']['azureml.runsource'])\n", - " #create experiment steps\n", - " if rundetails['properties']['azureml.runsource'] == 'azureml.PipelineRun':\n", - " print(experiment.name)\n", - " create_aml_experiment_steps(ws,experiment.name)\n", - "\n", - " pipeline_run = PipelineRun(experiment, rundetails['runId'])\n", - "\n", - " steps = pipeline_run.get_steps()\n", - " for step_run in steps:\n", - " #print(experiment.name + '_' + step_run.name)\n", - " \n", - " #create experiment relationship to workspace\n", - " relationshiptype = 'custom_ml_experiment_to_experimentstep'\n", - " end1type = 'custom_ml_experiment'\n", - " end2type = 'custom_ml_experiment_step'\n", - " end1_qn = purview_basepath + experiment.name\n", - " end2_qn = purview_basepath + experiment.name + '_' + step_run.name\n", - " try:\n", - " create_entities_relationship(relationshiptype,end1type,end2type,end1_qn,end2_qn)\n", - " except:\n", - " pass # ignore if relationship exists\n", - "\n", - " break # break after processing one completed run\n", - " #break\n", - "\n", - "def create_entities_relationship(relationshiptype,end1type,end2type,end1_qn,end2_qn):\n", - " relationship = {}\n", - " end1 = {}\n", - " end2 = {}\n", - "\n", - " end1[\"guid\"] = get_entity_guid(end1_qn,end1type)\n", - " end1[\"typeName\"] = end1type\n", - " end1[\"uniqueAttributes\"] = {\"qualifiedName\": end1_qn}\n", - "\n", - " end2[\"guid\"] = get_entity_guid(end2_qn,end2type)\n", - " end2[\"typeName\"] = end2type\n", - " end2[\"uniqueAttributes\"] = {\"qualifiedName\": end2_qn}\n", - "\n", - " relationship[\"typeName\"] = relationshiptype\n", - " relationship[\"attributes\"] = {}\n", - " relationship[\"guid\"] = guid.get_guid()\n", - " relationship[\"provenanceType\"] = 0\n", - " relationship[\"end1\"] = end1\n", - " 
relationship[\"end2\"] = end2\n", - " relationship\n", - " \n", - " client.upload_relationship(relationship) \n", - " \n", - "def create_package_entities(experimentname,packageslist):\n", - " packages_name = experimentname + '-packages' \n", - " packages_qn = \"pyapacheatlas://\" + packages_name\n", - "\n", - " # Create an asset for the packages.\n", - " packages_entity = AtlasEntity(\n", - " name = packages_name,\n", - " qualified_name = packages_qn,\n", - " typeName=\"custom_ml_packages\",\n", - " attributes = {\"notes\":\"test note\"},\n", - " guid=guid.get_guid()\n", - " )\n", - "\n", - " packages_entity.to_json(minimum=True)\n", - "\n", - " atlas_packages = []\n", - " relationships = []\n", - " for package in packageslist:\n", - " package_attibutes={}\n", - " temp_column={}\n", - " temp_column['programming_language'] = str(package[0])\n", - " package_attibutes.update(temp_column)\n", - " temp_column['package_name'] = str(package[1])\n", - " package_attibutes.update(temp_column)\n", - " temp_column['version'] = str(package[2])\n", - " package_attibutes.update(temp_column)\n", - " temp_column['notes'] = str(package[3])\n", - " package_attibutes.update(temp_column)\n", - "\n", - " # Create an entity for each package\n", - " name = str(package[1]) #experimentname + '-package-' + package[1] \n", - " qn = packages_qn + '#' + str(package[1]) #\"pyapacheatlas://\" + name\n", - "\n", - " package_entity = AtlasEntity(\n", - " name= name,\n", - " typeName=\"custom_ml_package\",\n", - " qualified_name=qn,\n", - " guid = guid.get_guid(),\n", - " attributes = package_attibutes,\n", - " relationshipAttributes = {\"packages\":packages_entity.to_json(minimum=True)}\n", - " )\n", - " atlas_packages.append(package_entity)\n", - "\n", - " atlas_packages\n", - "\n", - " # Prepare all the entities as a batch to be uploaded.\n", - " batch = [packages_entity] + atlas_packages\n", - " client.upload_entities(batch=batch) \n", - " \n", - "def create_experiment_config_entity(ws,experiment_name,automl_run):\n", - " # Get experiment config from AML run\n", - " import json\n", - " import pandas as pd\n", - " run_properties = automl_run.get_properties()\n", - " run_properties\n", - "\n", - " AMLSettingsJsonString = run_properties['AMLSettingsJsonString']\n", - " AMLSettings = json.loads(AMLSettingsJsonString)\n", - "\n", - " df_config = pd.DataFrame(list(AMLSettings.items()),columns = ['key','value']) \n", - "\n", - " keys = ['task_type','enable_early_stopping','experiment_timeout_minutes','primary_metric','compute_target','label_column_name','n_cross_validations','model_explainability']\n", - "\n", - " df_config = df_config[df_config['key'].isin(keys)]\n", - "\n", - " dict_config = df_config.to_dict(orient = 'records')\n", - " dict_config\n", - "\n", - " config_attibutes={}\n", - " for attibutes in dict_config:\n", - " temp_column={}\n", - " temp_column[attibutes['key']] = attibutes['value']\n", - " config_attibutes.update(temp_column)\n", - " config_attibutes\n", - "\n", - " # Create a entity for exp config \n", - " name = experiment_name + \"-config\"\n", - " qn = \"pyapacheatlas://\" + name\n", - "\n", - " exp_config_entity = AtlasEntity(\n", - " name=name,\n", - " typeName=\"custom_ml_exp_config\",\n", - " qualified_name=qn,\n", - " guid = guid.get_guid(),\n", - " attributes = config_attibutes\n", - " )\n", - "\n", - " # Upload all entities!\n", - " client.upload_entities(batch=[exp_config_entity.to_json()])\n", - " \n", - "def create_model_entity(ws,experiment_name,modelname):\n", - " # get deployed model\n", 
- " from azureml.core.model import Model\n", - " model = Model(ws, modelname)\n", - "\n", - " config_attibutes={}\n", - " temp_column={}\n", - " temp_column['workspace_name'] = model.workspace.name\n", - " config_attibutes.update(temp_column)\n", - " temp_column['workspace_subscription_id'] = model.workspace.subscription_id\n", - " config_attibutes.update(temp_column)\n", - " temp_column['workspace_subscription_id'] = model.workspace.subscription_id\n", - " config_attibutes.update(temp_column)\n", - " temp_column['workspace_resource_group'] = model.workspace.resource_group\n", - " config_attibutes.update(temp_column)\n", - " temp_column['name'] = model.name\n", - " config_attibutes.update(temp_column)\n", - " temp_column['id'] = model.id\n", - " config_attibutes.update(temp_column)\n", - " temp_column['version'] = model.version\n", - " config_attibutes.update(temp_column)\n", - " temp_column['tags'] = model.tags\n", - " config_attibutes.update(temp_column)\n", - " temp_column['properties'] = model.properties\n", - " config_attibutes.update(temp_column)\n", - "\n", - " # Create a entity for Model\n", - " name = modelname \n", - " qn = \"pyapacheatlas://\" + name\n", - "\n", - " exp_config_entity = AtlasEntity(\n", - " name=name,\n", - " typeName=\"custom_ml_model\",\n", - " qualified_name=qn,\n", - " guid = guid.get_guid(),\n", - " attributes = config_attibutes\n", - " )\n", - "\n", - " # Upload all entities!\n", - " client.upload_entities(batch=[exp_config_entity.to_json()]) \n", - " \n", - "def create_model_metrics_entity(experiment_name,best_run):\n", - " metrics = best_run.get_metrics()\n", - "\n", - " # select relevant metrics\n", - " auc = metrics.get('AUC_weighted')\n", - " accuracy = metrics.get('accuracy')\n", - " precision = metrics.get('precision_score_weighted')\n", - " recall = metrics.get('recall_score_weighted')\n", - " f1 = metrics.get('f1_score_weighted')\n", - "\n", - " # # combine into single dataframe\n", - " # metrics_df = sc.parallelize([['AUC', auc], ['Accuracy', accuracy], ['Precision', precision], ['Recall', recall], ['F1', f1]]).toDF(('Metric', 'Value'))\n", - " metrics = ['AUC','Accuracy','Precision','Recall','F1']\n", - " metricslist= [auc,accuracy,precision,recall,f1]\n", - " columns = ['Metric','Value']\n", - " metrics_df = pd.DataFrame(zip(metrics, metricslist),columns=columns)\n", - "\n", - "\n", - " dict_metrics = metrics_df.to_dict(orient = 'records')\n", - " dict_metrics\n", - "\n", - " config_attibutes={}\n", - " for attibutes in dict_metrics:\n", - " temp_column={}\n", - " temp_column[attibutes['Metric']] = attibutes['Value']\n", - " config_attibutes.update(temp_column)\n", - " config_attibutes\n", - "\n", - " name = experiment_name + \"-modelmetrics\"\n", - " qn = \"pyapacheatlas://\" + name\n", - "\n", - " # Create a entity for model metrics\n", - " exp_config_entity = AtlasEntity(\n", - " name=name,\n", - " typeName=\"custom_ml_model_metrics\",\n", - " qualified_name=qn,\n", - " guid = guid.get_guid(),\n", - " attributes = config_attibutes\n", - " )\n", - "\n", - " # Upload all entities!\n", - " client.upload_entities(batch=[exp_config_entity.to_json()])\n", - " \n", - "def create_experiment_lineage(experimentname,exp_data_qn,exp_config_qn,model_metrics_qn,model_qn): \n", - " # create experiment process \n", - " # inputs: prepareddata, modelconfig \n", - " # outputs: model metrics and registered model\n", - "\n", - " from pyapacheatlas.core import AtlasProcess\n", - "\n", - " in_data_ent_guid = get_entity_guid(exp_data_qn,'custom_dataset')\n", - " 
in_exp_config_guid = get_entity_guid(exp_config_qn,'custom_ml_exp_config')\n", - " out_model_metrics_guid = get_entity_guid(model_metrics_qn,'custom_ml_model_metrics')\n", - " out_model_guid = get_entity_guid(model_qn,'custom_ml_model')\n", - "\n", - " process_name = experimentname + '-train'\n", - " process_qn = \"pyapacheatlas://\" + process_name\n", - " process_type_name = \"Process\"\n", - "\n", - " process = AtlasProcess(\n", - " name=process_name,\n", - " typeName=process_type_name,\n", - " qualified_name=process_qn,\n", - " inputs = [{\"guid\":in_data_ent_guid},{\"guid\":in_exp_config_guid}],\n", - " outputs = [{\"guid\":out_model_metrics_guid},{\"guid\":out_model_guid}],\n", - " guid=guid.get_guid()\n", - " )\n", - "\n", - " # Prepare all the entities as a batch to be uploaded.\n", - " batch = [process]\n", - " batch\n", - "\n", - " # Upload all entities!\n", - " client.upload_entities(batch=batch) \n", - " \n", - "def create_model_service_entity(ws,experimentname,aci_service_name,samplejson):\n", - " # get deployed ACI Web Service\n", - " from azureml.core.webservice import AciWebservice\n", - " aciws = AciWebservice(ws, aci_service_name)\n", - "\n", - " config_attibutes={}\n", - " temp_column={}\n", - " temp_column['workspace_name'] = aciws.workspace.name\n", - " config_attibutes.update(temp_column)\n", - " temp_column['workspace_subscription_id'] = aciws.workspace.subscription_id\n", - " config_attibutes.update(temp_column)\n", - " temp_column['workspace_resource_group'] = aciws.workspace.resource_group\n", - " config_attibutes.update(temp_column)\n", - " temp_column['name'] = aciws.name\n", - " config_attibutes.update(temp_column)\n", - " temp_column['image_id'] = aciws.image_id\n", - " config_attibutes.update(temp_column)\n", - " temp_column['compute_type'] = aciws.compute_type\n", - " config_attibutes.update(temp_column)\n", - " temp_column['state'] = aciws.state\n", - " config_attibutes.update(temp_column)\n", - " temp_column['scoring_uri'] = aciws.scoring_uri\n", - " config_attibutes.update(temp_column)\n", - " temp_column['tags'] = aciws.tags\n", - " config_attibutes.update(temp_column)\n", - " temp_column['state'] = aciws.state\n", - " config_attibutes.update(temp_column)\n", - " temp_column['properties'] = aciws.properties\n", - " config_attibutes.update(temp_column)\n", - " temp_column['created_by'] = aciws.created_by\n", - " config_attibutes.update(temp_column)\n", - " temp_column['sample_json'] = samplejson\n", - " config_attibutes.update(temp_column)\n", - "\n", - " name = experimentname + \"-model_endpoint\"\n", - " qn = \"pyapacheatlas://\" + name\n", - "\n", - " # Create a entity for ACI Web Service\n", - " endpoint_entity = AtlasEntity(\n", - " name=name,\n", - " typeName=\"custom_ml_model_endpoint\",\n", - " qualified_name=qn,\n", - " guid = guid.get_guid(),\n", - " attributes = config_attibutes\n", - " )\n", - "\n", - " # Upload all entities!\n", - " client.upload_entities(batch=[endpoint_entity.to_json()]) \n", - " \n", - "def create_powerbi_dataset_and_lineage(experiment_name,pbi_workspace,pbi_datasetid,pbidata_ent_name,ml_dataset_ent_name,ml_dataset_ent_type):\n", - " \n", - " pbidata_entity_type = 'powerbi_dataset'\n", - " pbidata_ent_qn = pbi_workspace + '/datasets/' + pbi_datasetid \n", - " purview_basepath = 'pyapacheatlas://'\n", - " #\"https://msit.powerbi.com/groups/7d666287-f9b8-45ff-be6c-9909afe9df40/datasets/e5a30c22-466d-4a30-a1ac-8736ed6567cc\"\n", - "\n", - " pbidata_ent = AtlasEntity(\n", - " name=pbidata_ent_name,\n", - " 
typeName=pbidata_entity_type,\n", - " qualified_name= pbidata_ent_qn,\n", - " workspace = pbi_workspace,\n", - " guid = guid.get_guid()\n", - " )\n", - "\n", - " # Prepare all the entities as a batch to be uploaded.\n", - " batch = [pbidata_ent]\n", - " batch\n", - "\n", - " # Upload all entities!\n", - " client.upload_entities(batch=batch)\n", - "\n", - " #cretae powerbi_dataset_process lineage\n", - " in_ent_guids = []\n", - " in_ent_guid = get_entity_guid(purview_basepath + ml_dataset_ent_name,ml_dataset_ent_type)\n", - " in_ent_guids.append({'guid':in_ent_guid})\n", - "\n", - " out_ent_guids = []\n", - " out_ent_guid = get_entity_guid(pbidata_ent_qn,pbidata_entity_type)\n", - " out_ent_guids.append({'guid':out_ent_guid})\n", - "\n", - " process_name = 'createpowerbidataset' + pbidata_ent_name + experiment_name\n", - " process_qn = \"pyapacheatlas://\" + process_name\n", - " process_type_name = \"powerbi_dataset_process\"\n", - "\n", - " process = AtlasProcess(\n", - " name=process_name,\n", - " typeName=process_type_name,\n", - " qualified_name=process_qn,\n", - " inputs = in_ent_guids,\n", - " outputs = out_ent_guids,\n", - " guid=guid.get_guid()\n", - " )\n", - "\n", - " # Prepare all the entities as a batch to be uploaded.\n", - " batch = [process]\n", - " batch\n", - "\n", - " # Upload all entities!\n", - " client.upload_entities(batch=batch)\n", - " \n", - "def create_powerbi_report_and_lineage(experiment_name,pbi_workspace,pbi_reportid,pbi_ent_name,pbi_datasetid):\n", - "\n", - " #create powerbi report\n", - " pbi_entity_type = 'powerbi_report'\n", - " pbi_ent_qn = pbi_workspace + '/reports/' + pbi_reportid \n", - " purview_basepath = 'pyapacheatlas://'\n", - " \n", - " pbi_ent = AtlasEntity(\n", - " name=pbi_ent_name,\n", - " typeName=pbi_entity_type,\n", - " qualified_name= pbi_ent_qn, \n", - " workspace = pbi_workspace,\n", - " guid = guid.get_guid()\n", - " )\n", - "\n", - " # Prepare all the entities as a batch to be uploaded.\n", - " batch = [pbi_ent]\n", - " batch\n", - "\n", - " # Upload all entities!\n", - " client.upload_entities(batch=batch)\n", - "\n", - " #create powerbi dashboard process lineage\n", - " pbidata_ent_qn = pbi_workspace + '/datasets/' + pbi_datasetid \n", - " in_ent_guids = []\n", - " in_ent_guid = get_entity_guid(pbidata_ent_qn,'powerbi_dataset')\n", - " in_ent_guids.append({'guid':in_ent_guid})\n", - "\n", - " out_ent_guids = []\n", - " out_ent_guid = get_entity_guid(pbi_ent_qn,'powerbi_report')\n", - " out_ent_guids.append({'guid':out_ent_guid})\n", - "\n", - " process_name = 'createpowerbireport' + pbi_ent_name + experiment_name\n", - " process_qn = \"pyapacheatlas://\" + process_name\n", - " process_type_name = \"powerbi_report_process\"\n", - "\n", - " process = AtlasProcess(\n", - " name=process_name,\n", - " typeName=process_type_name,\n", - " qualified_name=process_qn,\n", - " inputs = in_ent_guids,\n", - " outputs = out_ent_guids,\n", - " guid=guid.get_guid()\n", - " )\n", - "\n", - " # Prepare all the entities as a batch to be uploaded.\n", - " batch = [process]\n", - " batch\n", - "\n", - " # Upload all entities!\n", - " client.upload_entities(batch=batch)\n", - " \n", - "# clean up datasets\n", - "def cleanup_entities(typename, entitytype):\n", - " filter_setup = {\"typeName\": typename, \"includeSubTypes\": True}\n", - " search = client.search_entities(\"*\", search_filter=filter_setup)\n", - " for entity in search:\n", - " #print(entity)\n", - " if entity.get(\"entityType\") == entitytype:\n", - " 
print(entity.get(\"id\"),entity.get(\"qualifiedName\"),entity.get(\"entityType\"))\n", - " guid = entity.get(\"id\")\n", - " client.delete_entity(guid=guid)\n", - "\n" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.7.4" }, - "save_output": true, - "synapse_widget": { - "state": {}, - "version": "0.1" - } - }, - "nbformat": 4, - "nbformat_minor": 2 -} + "nbformat": 4, + "nbformat_minor": 2 +} \ No newline at end of file diff --git a/SynapseNotebooks/04_Create_CreditRisk_Experiment.ipynb b/SynapseNotebooks/04_Create_CreditRisk_Experiment.ipynb index 9a7d6d7..ae3d40a 100644 --- a/SynapseNotebooks/04_Create_CreditRisk_Experiment.ipynb +++ b/SynapseNotebooks/04_Create_CreditRisk_Experiment.ipynb @@ -1,590 +1,599 @@ { - "cells": [ - { - "cell_type": "code", - "execution_count": 1, - "metadata": { - "collapsed": true, - "jupyter": { - "outputs_hidden": false, - "source_hidden": false + "cells": [ + { + "cell_type": "markdown", + "source": [ + "Copyright (c) Microsoft. All rights reserved.\r\n", + "\r\n", + "Licensed under the MIT license." + ], + "metadata": { + "nteract": { + "transient": { + "deleting": false + } + } + } }, - "nteract": { - "transient": { - "deleting": false - } - } - }, - "outputs": [], - "source": [ - "%run /01_Authenticate_to_Purview_AML" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": { - "collapsed": true, - "jupyter": { - "outputs_hidden": false, - "source_hidden": false + { + "cell_type": "code", + "source": [ + "%run /01_Authenticate_to_Purview_AML" + ], + "outputs": [], + "execution_count": 1, + "metadata": { + "jupyter": { + "outputs_hidden": false, + "source_hidden": false + }, + "nteract": { + "transient": { + "deleting": false + } + }, + "collapsed": true + } }, - "nteract": { - "transient": { - "deleting": false - } - } - }, - "outputs": [], - "source": [ - "%run /02_Create_ML_Lineage_Types" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": { - "collapsed": true, - "jupyter": { - "outputs_hidden": false, - "source_hidden": false + { + "cell_type": "code", + "source": [ + "%run /02_Create_ML_Lineage_Types" + ], + "outputs": [], + "execution_count": 2, + "metadata": { + "jupyter": { + "outputs_hidden": false, + "source_hidden": false + }, + "nteract": { + "transient": { + "deleting": false + } + }, + "collapsed": true + } }, - "nteract": { - "transient": { - "deleting": false - } - } - }, - "outputs": [], - "source": [ - "%run /03_Create_ML_Lineage_Functions" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": {}, - "outputs": [], - "source": [ - "import json" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": {}, - "outputs": [], - "source": [ - "#update below variables with synapse adls name and container/filesystem name\n", - "data_lake_account_name = \"\"\n", - "file_system_name = \"data\"" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": { - "jupyter": { - "outputs_hidden": false, - "source_hidden": false + { + "cell_type": "code", + "source": [ + "%run /03_Create_ML_Lineage_Functions" + ], + "outputs": [], + "execution_count": 3, + "metadata": { + "jupyter": { + "outputs_hidden": false, + "source_hidden": false + }, + 
"nteract": { + "transient": { + "deleting": false + } + }, + "collapsed": true + } }, - "nteract": { - "transient": { - "deleting": false - } - } - }, - "outputs": [], - "source": [ - "import pandas as pd\n", - "import numpy as np\n", - "\n", - "synapse_base_path = 'abfss://' + file_system_name + '@' + data_lake_account_name + '.dfs.core.windows.net'\n", - "df_borrower = spark.read.load(synapse_base_path+ '/creditriskdata/borrower.csv', format='csv', header=True).toPandas()\n", - "#display(df_borrower.head(10))\n", - "\n", - "df_loan = spark.read.load(synapse_base_path + '/creditriskdata/loan.csv', format='csv', header=True).toPandas()\n", - "#display(df_loan.head(1))\n", - "\n", - "# Join data and do some transformations\n", - "df_data = df_borrower.merge(df_loan,on='memberId',how='inner')\n", - "df_data.shape\n", - "\n", - "df_sp = spark.createDataFrame(df_data)\n", - "df_sp = df_sp.drop('loanStatus')\n", - "\n", - "df_sp.write.option('header', 'true').mode('overwrite').csv(synapse_base_path + '/creditriskdata/testdata/')\n", - "\n", - "df_data['homeOwnership'] = df_data['homeOwnership'].replace('nan', np.nan).fillna(0)\n", - "df_data['isJointApplication'] = df_data['isJointApplication'].replace('nan', np.nan).fillna(0)\n", - "\n", - "drop_cols = ['memberId', 'loanId', 'date','grade']\n", - "df_data = df_data.drop(drop_cols, axis=1)\n", - "#df_data.dtypes" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "metadata": { - "collapsed": true, - "jupyter": { - "outputs_hidden": false, - "source_hidden": false + { + "cell_type": "code", + "source": [ + "import json" + ], + "outputs": [], + "execution_count": 2, + "metadata": {} }, - "nteract": { - "transient": { - "deleting": false - } - } - }, - "outputs": [], - "source": [ - "experimentname = \"CreditRiskExperiment\"" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "metadata": { - "collapsed": true, - "jupyter": { - "outputs_hidden": false, - "source_hidden": false + { + "cell_type": "code", + "source": [ + "#update below variables with synapse adls name and container/filesystem name\n", + "data_lake_account_name = \"\"\n", + "file_system_name = \"data\"" + ], + "outputs": [], + "execution_count": 3, + "metadata": {} }, - "nteract": { - "transient": { - "deleting": false - } - } - }, - "outputs": [], - "source": [ - "#create an entity for prepated data\n", - "data_ent_name = 'creditriskdata'\n", - "create_data_entity_with_schema(df_data,data_ent_name,'custom_dataset')\n", - "\n", - "#create preprocess lineage \n", - "\n", - "syn_basepath = 'https://' + data_lake_account_name + '.dfs.core.windows.net/' + file_system_name + '/creditriskdata'\n", - "purview_basepath = 'pyapacheatlas://'\n", - "\n", - "in_ent_qns = {syn_basepath + '/borrower.csv':'azure_datalake_gen2_path',syn_basepath + '/loan.csv':'azure_datalake_gen2_path'}\n", - "out_ent_qns = {purview_basepath + data_ent_name:'custom_dataset'}\n", - "\n", - "processname = '-preprocess'\n", - "create_lineage_for_entities(experimentname,processname, in_ent_qns,out_ent_qns,ColumnMapping=True)" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "metadata": { - "collapsed": true, - "jupyter": { - "outputs_hidden": false, - "source_hidden": false + { + "cell_type": "code", + "source": [ + "import pandas as pd\n", + "import numpy as np\n", + "\n", + "synapse_base_path = 'abfss://' + file_system_name + '@' + data_lake_account_name + '.dfs.core.windows.net'\n", + "df_borrower = spark.read.load(synapse_base_path+ '/creditriskdata/borrower.csv', format='csv', 
header=True).toPandas()\n", + "#display(df_borrower.head(10))\n", + "\n", + "df_loan = spark.read.load(synapse_base_path + '/creditriskdata/loan.csv', format='csv', header=True).toPandas()\n", + "#display(df_loan.head(1))\n", + "\n", + "# Join data and do some transformations\n", + "df_data = df_borrower.merge(df_loan,on='memberId',how='inner')\n", + "df_data.shape\n", + "\n", + "df_sp = spark.createDataFrame(df_data)\n", + "df_sp = df_sp.drop('loanStatus')\n", + "\n", + "df_sp.write.option('header', 'true').mode('overwrite').csv(synapse_base_path + '/creditriskdata/testdata/')\n", + "\n", + "df_data['homeOwnership'] = df_data['homeOwnership'].replace('nan', np.nan).fillna(0)\n", + "df_data['isJointApplication'] = df_data['isJointApplication'].replace('nan', np.nan).fillna(0)\n", + "\n", + "drop_cols = ['memberId', 'loanId', 'date','grade']\n", + "df_data = df_data.drop(drop_cols, axis=1)\n", + "#df_data.dtypes" + ], + "outputs": [], + "execution_count": 4, + "metadata": { + "jupyter": { + "outputs_hidden": false, + "source_hidden": false + }, + "nteract": { + "transient": { + "deleting": false + } + } + } }, - "nteract": { - "transient": { - "deleting": false - } - } - }, - "outputs": [], - "source": [ - "from azureml.core.experiment import Experiment\n", - "from azureml.train.automl.run import AutoMLRun\n", - "from azureml.train.automl import AutoMLConfig\n", - "\n", - "##run only once\n", - "experiment = Experiment(ws, experimentname)\n", - "\n", - "automl_classifier_config = AutoMLConfig(\n", - " task='classification', \n", - " enable_early_stopping = True, \n", - " iterations = 2, \n", - " experiment_timeout_minutes=15,\n", - " primary_metric='AUC_weighted',\n", - " training_data= df_data,\n", - " #compute = 'local',\n", - " label_column_name='loanStatus',\n", - " n_cross_validations=5,\n", - " model_explainability=True,\n", - " enable_onnx_compatible_models=True,\n", - " enable_voting_ensemble=False,\n", - " enable_stack_ensemble=False\n", - " )\n", - "local_run = experiment.submit(automl_classifier_config, show_output=True)" - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "metadata": { - "collapsed": true, - "jupyter": { - "outputs_hidden": false, - "source_hidden": false + { + "cell_type": "code", + "source": [ + "experimentname = \"CreditRiskExperiment\"" + ], + "outputs": [], + "execution_count": 6, + "metadata": { + "jupyter": { + "outputs_hidden": false, + "source_hidden": false + }, + "nteract": { + "transient": { + "deleting": false + } + }, + "collapsed": true + } }, - "nteract": { - "transient": { - "deleting": false - } - } - }, - "outputs": [], - "source": [ - "# get experiment run, get the best model and register\n", - "\n", - "from azureml.core.experiment import Experiment\n", - "from azureml.core.workspace import Workspace\n", - "from azureml.train.automl.run import AutoMLRun\n", - "from azureml.train.automl import AutoMLConfig\n", - "from azureml.core.model import Model\n", - "import joblib\n", - "\n", - "# get experiment run, get the best model and register\n", - "experimentname = \"CreditRiskExperiment\"\n", - "\n", - "for automl_run in ws.experiments[experimentname].get_runs():\n", - " best_run, fitted_model = automl_run.get_output() # We are taking the first run. 
You can update this if you like to take a different run\n", - " break\n", - "\n", - "#save the model to a local file\n", - "model_path = 'creditrisk_model'\n", - "joblib.dump(fitted_model, model_path)\n", - "\n", - "model_name = \"creditrisk_model\"\n", - "registered_model = Model.register(model_path = model_path, # this points to a local file\n", - " model_name = model_name, # name the model is registered as\n", - " tags = {'type': \"classification\"}, \n", - " description = \"Credit Risk Classifier\", \n", - " workspace = ws)\n" - ] - }, - { - "cell_type": "code", - "execution_count": 11, - "metadata": { - "collapsed": true, - "jupyter": { - "outputs_hidden": false, - "source_hidden": false + { + "cell_type": "code", + "source": [ + "#create an entity for prepated data\n", + "data_ent_name = 'creditriskdata'\n", + "create_data_entity_with_schema(df_data,data_ent_name,'custom_dataset')\n", + "\n", + "#create preprocess lineage \n", + "\n", + "syn_basepath = 'https://' + data_lake_account_name + '.dfs.core.windows.net/' + file_system_name + '/creditriskdata'\n", + "purview_basepath = 'pyapacheatlas://'\n", + "\n", + "in_ent_qns = {syn_basepath + '/borrower.csv':'azure_datalake_gen2_path',syn_basepath + '/loan.csv':'azure_datalake_gen2_path'}\n", + "out_ent_qns = {purview_basepath + data_ent_name:'custom_dataset'}\n", + "\n", + "processname = '-preprocess'\n", + "create_lineage_for_entities(experimentname,processname, in_ent_qns,out_ent_qns,ColumnMapping=True)" + ], + "outputs": [], + "execution_count": 7, + "metadata": { + "jupyter": { + "outputs_hidden": false, + "source_hidden": false + }, + "nteract": { + "transient": { + "deleting": false + } + }, + "collapsed": true + } }, - "nteract": { - "transient": { - "deleting": false - } - } - }, - "outputs": [], - "source": [ - "#create packages entities\n", - "#[programming_language,package_name,version,notes]\n", - "packageslist = [['python','mmlspark','v0.0.11','older versions before 0.0.10 give error'],\n", - " ['python','scikit-learn','0.22rc2.post1','latest version 0.24.x gives error if you call the model from Azure Function']]\n", - "create_package_entities(experimentname,packageslist)" - ] - }, - { - "cell_type": "code", - "execution_count": 12, - "metadata": { - "collapsed": true, - "jupyter": { - "outputs_hidden": false, - "source_hidden": false + { + "cell_type": "code", + "source": [ + "from azureml.core.experiment import Experiment\n", + "from azureml.train.automl.run import AutoMLRun\n", + "from azureml.train.automl import AutoMLConfig\n", + "\n", + "##run only once\n", + "experiment = Experiment(ws, experimentname)\n", + "\n", + "automl_classifier_config = AutoMLConfig(\n", + " task='classification', \n", + " enable_early_stopping = True, \n", + " iterations = 2, \n", + " experiment_timeout_minutes=15,\n", + " primary_metric='AUC_weighted',\n", + " training_data= df_data,\n", + " #compute = 'local',\n", + " label_column_name='loanStatus',\n", + " n_cross_validations=5,\n", + " model_explainability=True,\n", + " enable_onnx_compatible_models=True,\n", + " enable_voting_ensemble=False,\n", + " enable_stack_ensemble=False\n", + " )\n", + "local_run = experiment.submit(automl_classifier_config, show_output=True)" + ], + "outputs": [], + "execution_count": 8, + "metadata": { + "jupyter": { + "outputs_hidden": false, + "source_hidden": false + }, + "nteract": { + "transient": { + "deleting": false + } + }, + "collapsed": true + } }, - "nteract": { - "transient": { - "deleting": false - } - } - }, - "outputs": [], - "source": [ - 
"#create experiment train lineage\n", - "create_experiment_config_entity(ws,experimentname,automl_run)\n", - "create_model_entity(ws,experimentname,model_name)\n", - "create_model_metrics_entity(experimentname,best_run)\n", - "\n", - "pbasepath = 'pyapacheatlas://'\n", - "\n", - "in_ent_qns = {pbasepath + data_ent_name:'custom_dataset',pbasepath + experimentname + \"-config\":'custom_ml_exp_config',pbasepath + experimentname + '-packages':'custom_ml_packages'}\n", - "out_ent_qns = {pbasepath + model_name:'custom_ml_model',pbasepath + experimentname + \"-modelmetrics\":'custom_ml_model_metrics'}\n", - "\n", - "processname = '-train'\n", - "create_lineage_for_entities(experimentname,processname, in_ent_qns,out_ent_qns,ColumnMapping=False)" - ] - }, - { - "cell_type": "code", - "execution_count": 13, - "metadata": { - "collapsed": true, - "jupyter": { - "outputs_hidden": false, - "source_hidden": false + { + "cell_type": "code", + "source": [ + "# get experiment run, get the best model and register\n", + "\n", + "from azureml.core.experiment import Experiment\n", + "from azureml.core.workspace import Workspace\n", + "from azureml.train.automl.run import AutoMLRun\n", + "from azureml.train.automl import AutoMLConfig\n", + "from azureml.core.model import Model\n", + "import joblib\n", + "\n", + "# get experiment run, get the best model and register\n", + "experimentname = \"CreditRiskExperiment\"\n", + "\n", + "for automl_run in ws.experiments[experimentname].get_runs():\n", + " best_run, fitted_model = automl_run.get_output() # We are taking the first run. You can update this if you like to take a different run\n", + " break\n", + "\n", + "#save the model to a local file\n", + "model_path = 'creditrisk_model'\n", + "joblib.dump(fitted_model, model_path)\n", + "\n", + "model_name = \"creditrisk_model\"\n", + "registered_model = Model.register(model_path = model_path, # this points to a local file\n", + " model_name = model_name, # name the model is registered as\n", + " tags = {'type': \"classification\"}, \n", + " description = \"Credit Risk Classifier\", \n", + " workspace = ws)\n" + ], + "outputs": [], + "execution_count": 10, + "metadata": { + "jupyter": { + "outputs_hidden": false, + "source_hidden": false + }, + "nteract": { + "transient": { + "deleting": false + } + }, + "collapsed": true + } }, - "nteract": { - "transient": { - "deleting": false - } - } - }, - "outputs": [], - "source": [ - "scoring_script = \"\"\"\n", - "import json\n", - "import pickle\n", - "import numpy as np\n", - "import pandas as pd\n", - "import azureml.train.automl\n", - "from sklearn.externals import joblib\n", - "from azureml.core.model import Model\n", - "\n", - "def init():\n", - " global model\n", - " # This name is model.id of model that we want to deploy deserialize the model file back\n", - " model_path = Model.get_model_path(model_name = 'creditrisk_model')\n", - " model = joblib.load(model_path)\n", - "\n", - "def run(input_json): \n", - " try:\n", - " data_df = pd.read_json(input_json) \n", - " # Get the predictions...\n", - " prediction = model.predict(data_df)\n", - " prediction = json.dumps(prediction.tolist())\n", - " except Exception as e:\n", - " prediction = str(e)\n", - " return prediction\n", - "\"\"\"\n", - "exec(scoring_script)\n", - "with open(\"scoring_script.py\", \"w\") as file:\n", - " file.write(scoring_script)\n", - " \n", - "scoring_script_file_name = 'scoring_script.py'\n", - "\n", - "#test locally\n", - "import numpy as np\n", - "# X_test = spark.sql('select * from 
default.creditrisk_data limit 20').toPandas()\n", - "drop_cols = ['loanStatus']\n", - "X_test = df_data.drop(drop_cols, axis=1)\n", - "X_test = X_test.head(1)\n", - "json_test_data = X_test.to_json(orient='records')\n", - "print(json_test_data)\n", - "init()\n", - "run(json_test_data)" - ] - }, - { - "cell_type": "code", - "execution_count": 14, - "metadata": { - "collapsed": true, - "jupyter": { - "outputs_hidden": false, - "source_hidden": false + { + "cell_type": "code", + "source": [ + "#create packages entities\n", + "#[programming_language,package_name,version,notes]\n", + "packageslist = [['python','mmlspark','v0.0.11','older versions before 0.0.10 give error'],\n", + " ['python','scikit-learn','0.22rc2.post1','latest version 0.24.x gives error if you call the model from Azure Function']]\n", + "create_package_entities(experimentname,packageslist)" + ], + "outputs": [], + "execution_count": 11, + "metadata": { + "jupyter": { + "outputs_hidden": false, + "source_hidden": false + }, + "nteract": { + "transient": { + "deleting": false + } + }, + "collapsed": true + } }, - "nteract": { - "transient": { - "deleting": false - } - } - }, - "outputs": [], - "source": [ - "# obtain conda dependencies from the automl run and save the file locally\n", - "from azureml.core import Environment\n", - "environment_config_file = 'creditrisk_conda_env.yml'\n", - "best_run.download_file('outputs/conda_env_v_1_0_0.yml', environment_config_file)\n", - "# with open('creditrisk_conda_env.yml', 'r') as f:\n", - "# print(f.read())\n", - "\n", - "# create the environment based on the saved conda dependencies file\n", - "myenv = Environment.from_conda_specification(name=\"creditriskenv\", file_path=environment_config_file)\n", - "myenv.register(workspace=ws)\n", - "\n", - "from azureml.core.model import InferenceConfig\n", - "from azureml.core.webservice import AciWebservice\n", - "from azureml.core.webservice import Webservice\n", - "\n", - "# Configure and deploy the web service to Azure Container Instances\n", - "inference_config = InferenceConfig(environment=myenv, entry_script=scoring_script_file_name)\n", - "aci_config = AciWebservice.deploy_configuration(cpu_cores = 1, memory_gb= 2, tags = { 'type' : 'automl-classification'}, description='AutoML Credit Risk Classifier Service')\n", - "aci_service_name = 'creditrisk-automl-service'\n", - "aci_service = Model.deploy(ws, aci_service_name, [registered_model], inference_config, aci_config)\n", - "aci_service.wait_for_deployment(show_output = True)\n", - "print(aci_service.state)" - ] - }, - { - "cell_type": "code", - "execution_count": 16, - "metadata": { - "collapsed": true, - "jupyter": { - "outputs_hidden": false, - "source_hidden": false + { + "cell_type": "code", + "source": [ + "#create experiment train lineage\n", + "create_experiment_config_entity(ws,experimentname,automl_run)\n", + "create_model_entity(ws,experimentname,model_name)\n", + "create_model_metrics_entity(experimentname,best_run)\n", + "\n", + "pbasepath = 'pyapacheatlas://'\n", + "\n", + "in_ent_qns = {pbasepath + data_ent_name:'custom_dataset',pbasepath + experimentname + \"-config\":'custom_ml_exp_config',pbasepath + experimentname + '-packages':'custom_ml_packages'}\n", + "out_ent_qns = {pbasepath + model_name:'custom_ml_model',pbasepath + experimentname + \"-modelmetrics\":'custom_ml_model_metrics'}\n", + "\n", + "processname = '-train'\n", + "create_lineage_for_entities(experimentname,processname, in_ent_qns,out_ent_qns,ColumnMapping=False)" + ], + "outputs": [], + 
"execution_count": 12, + "metadata": { + "jupyter": { + "outputs_hidden": false, + "source_hidden": false + }, + "nteract": { + "transient": { + "deleting": false + } + }, + "collapsed": true + } }, - "nteract": { - "transient": { - "deleting": false - } - } - }, - "outputs": [], - "source": [ - "aci_service_name = 'creditrisk-automl-service'\n", - "create_model_service_entity(ws,experimentname,aci_service_name,json_test_data)\n", - "\n", - "pbasepath = 'pyapacheatlas://'\n", - "\n", - "in_ent_qns = {pbasepath + model_name:'custom_ml_model'}\n", - "out_ent_qns = {pbasepath + experimentname + \"-model_endpoint\":'custom_ml_model_endpoint'}\n", - "\n", - "processname = '-deploymodel'\n", - "create_lineage_for_entities(experimentname,processname, in_ent_qns,out_ent_qns,ColumnMapping=False)" - ] - }, - { - "cell_type": "code", - "execution_count": 17, - "metadata": { - "jupyter": { - "outputs_hidden": false, - "source_hidden": false + { + "cell_type": "code", + "source": [ + "scoring_script = \"\"\"\n", + "import json\n", + "import pickle\n", + "import numpy as np\n", + "import pandas as pd\n", + "import azureml.train.automl\n", + "from sklearn.externals import joblib\n", + "from azureml.core.model import Model\n", + "\n", + "def init():\n", + " global model\n", + " # This name is model.id of model that we want to deploy deserialize the model file back\n", + " model_path = Model.get_model_path(model_name = 'creditrisk_model')\n", + " model = joblib.load(model_path)\n", + "\n", + "def run(input_json): \n", + " try:\n", + " data_df = pd.read_json(input_json) \n", + " # Get the predictions...\n", + " prediction = model.predict(data_df)\n", + " prediction = json.dumps(prediction.tolist())\n", + " except Exception as e:\n", + " prediction = str(e)\n", + " return prediction\n", + "\"\"\"\n", + "exec(scoring_script)\n", + "with open(\"scoring_script.py\", \"w\") as file:\n", + " file.write(scoring_script)\n", + " \n", + "scoring_script_file_name = 'scoring_script.py'\n", + "\n", + "#test locally\n", + "import numpy as np\n", + "# X_test = spark.sql('select * from default.creditrisk_data limit 20').toPandas()\n", + "drop_cols = ['loanStatus']\n", + "X_test = df_data.drop(drop_cols, axis=1)\n", + "X_test = X_test.head(1)\n", + "json_test_data = X_test.to_json(orient='records')\n", + "print(json_test_data)\n", + "init()\n", + "run(json_test_data)" + ], + "outputs": [], + "execution_count": 13, + "metadata": { + "jupyter": { + "outputs_hidden": false, + "source_hidden": false + }, + "nteract": { + "transient": { + "deleting": false + } + }, + "collapsed": true + } + }, + { + "cell_type": "code", + "source": [ + "# obtain conda dependencies from the automl run and save the file locally\n", + "from azureml.core import Environment\n", + "environment_config_file = 'creditrisk_conda_env.yml'\n", + "best_run.download_file('outputs/conda_env_v_1_0_0.yml', environment_config_file)\n", + "# with open('creditrisk_conda_env.yml', 'r') as f:\n", + "# print(f.read())\n", + "\n", + "# create the environment based on the saved conda dependencies file\n", + "myenv = Environment.from_conda_specification(name=\"creditriskenv\", file_path=environment_config_file)\n", + "myenv.register(workspace=ws)\n", + "\n", + "from azureml.core.model import InferenceConfig\n", + "from azureml.core.webservice import AciWebservice\n", + "from azureml.core.webservice import Webservice\n", + "\n", + "# Configure and deploy the web service to Azure Container Instances\n", + "inference_config = InferenceConfig(environment=myenv, 
entry_script=scoring_script_file_name)\n", + "aci_config = AciWebservice.deploy_configuration(cpu_cores = 1, memory_gb= 2, tags = { 'type' : 'automl-classification'}, description='AutoML Credit Risk Classifier Service')\n", + "aci_service_name = 'creditrisk-automl-service'\n", + "aci_service = Model.deploy(ws, aci_service_name, [registered_model], inference_config, aci_config)\n", + "aci_service.wait_for_deployment(show_output = True)\n", + "print(aci_service.state)" + ], + "outputs": [], + "execution_count": 14, + "metadata": { + "jupyter": { + "outputs_hidden": false, + "source_hidden": false + }, + "nteract": { + "transient": { + "deleting": false + } + }, + "collapsed": true + } + }, + { + "cell_type": "code", + "source": [ + "aci_service_name = 'creditrisk-automl-service'\n", + "create_model_service_entity(ws,experimentname,aci_service_name,json_test_data)\n", + "\n", + "pbasepath = 'pyapacheatlas://'\n", + "\n", + "in_ent_qns = {pbasepath + model_name:'custom_ml_model'}\n", + "out_ent_qns = {pbasepath + experimentname + \"-model_endpoint\":'custom_ml_model_endpoint'}\n", + "\n", + "processname = '-deploymodel'\n", + "create_lineage_for_entities(experimentname,processname, in_ent_qns,out_ent_qns,ColumnMapping=False)" + ], + "outputs": [], + "execution_count": 16, + "metadata": { + "jupyter": { + "outputs_hidden": false, + "source_hidden": false + }, + "nteract": { + "transient": { + "deleting": false + } + }, + "collapsed": true + } }, - "nteract": { - "transient": { - "deleting": false - } + { + "cell_type": "code", + "source": [ + "#batch inferencing\n", + "df_test = spark.read.load(synapse_base_path +'/creditriskdata/testdata', format='csv', header=True).toPandas()\n", + "\n", + "drop_cols = ['memberId', 'loanId', 'date','grade']\n", + "df_test1 = df_test.drop(drop_cols, axis=1)\n", + "\n", + "model_path = Model.get_model_path(model_name = 'creditrisk_model')\n", + "model = joblib.load(model_path)\n", + "\n", + "prediction = model.predict(df_test1)\n", + "prediction\n", + "\n", + "df_result = df_test \n", + "df_result['prediction'] = prediction\n", + "df_result\n", + "\n", + "data_lake_account_name = 'purviewaccdl'\n", + "file_system_name = 'purviewaccfs'\n", + "df_sp = spark.createDataFrame(df_result)\n", + "df_sp.write.option('header', 'true').mode('overwrite').csv(synapse_base_path + '/creditriskdata/batchpredictions/')\n", + "\n", + "df_sp.write.mode(\"overwrite\").saveAsTable(\"default.creditrisk_predictions\")" + ], + "outputs": [], + "execution_count": 17, + "metadata": { + "jupyter": { + "outputs_hidden": false, + "source_hidden": false + }, + "nteract": { + "transient": { + "deleting": false + } + } + } + }, + { + "cell_type": "code", + "source": [ + "#create an entity for test data\n", + "test_data_ent_name = 'creditrisktestdata'\n", + "create_data_entity_with_schema(df_test,test_data_ent_name,entitytype='custom_dataset')\n", + "\n", + "#create an entity for batch inference data\n", + "batchpred_data_ent_name = 'creditriskbatchpredictions'\n", + "create_data_entity_with_schema(df_result,batchpred_data_ent_name,entitytype='custom_dataset')\n", + "\n", + "#create batch inference lineage \n", + "syn_basepath = 'https://' + data_lake_account_name + 'dfs.core.windows.net' + file_system_name + '/'\n", + "pbasepath = 'pyapacheatlas://'\n", + "\n", + "in_ent_qns = {pbasepath + test_data_ent_name:'custom_dataset',pbasepath + model_name:'custom_ml_model'}\n", + "out_ent_qns = {pbasepath + batchpred_data_ent_name:'custom_dataset'}\n", + "\n", + "processname = '-batchinference'\n", + 
"create_lineage_for_entities(experimentname,processname, in_ent_qns,out_ent_qns,ColumnMapping=True)" + ], + "outputs": [], + "execution_count": 22, + "metadata": { + "jupyter": { + "outputs_hidden": false, + "source_hidden": false + }, + "nteract": { + "transient": { + "deleting": false + } + }, + "collapsed": true + } + }, + { + "cell_type": "code", + "source": [ + "## uncomment below code to link PowerBI Dataset and Report in lineage if you have access to a PBI workspace " + ], + "outputs": [], + "execution_count": null, + "metadata": {} + }, + { + "cell_type": "code", + "source": [ + "# #The PowerBI entities will populate with more details if you set up a scan for PBI workspaces in Purview\n", + "# #We are only creating placeholders and links for lineage below\n", + "\n", + "# #create PowerBI dataset entity and lineage \n", + "# pbi_workspace = '' #'https://xxx.powerbi.com/groups/7c555287-f9b8-45ff-be6c-9909afe9df40'\n", + "# pbi_datasetid = '' #'c4a30c22-466d-4a30-a1ac-8736ed6567cc' \n", + "\n", + "# pbidata_ent_name = 'creditriskpbidataset' \n", + "\n", + "# create_powerbi_dataset_and_lineage(experimentname,pbi_workspace,pbi_datasetid,pbidata_ent_name,batchpred_data_ent_name,'custom_dataset')" + ], + "outputs": [], + "execution_count": 23, + "metadata": {} + }, + { + "cell_type": "code", + "source": [ + "# #create PowerBI report entity and lineage\n", + "# pbi_reportid = '' #'e495453d-6c0c-4fb9-bdc4-556319f6a57b'\n", + "# pbi_ent_name = 'creditriskpbireport'\n", + " \n", + "# create_powerbi_report_and_lineage(experimentname,pbi_workspace,pbi_reportid,pbi_ent_name,pbi_datasetid)" + ], + "outputs": [], + "execution_count": 24, + "metadata": {} } - }, - "outputs": [], - "source": [ - "#batch inferencing\n", - "df_test = spark.read.load(synapse_base_path +'/creditriskdata/testdata', format='csv', header=True).toPandas()\n", - "\n", - "drop_cols = ['memberId', 'loanId', 'date','grade']\n", - "df_test1 = df_test.drop(drop_cols, axis=1)\n", - "\n", - "model_path = Model.get_model_path(model_name = 'creditrisk_model')\n", - "model = joblib.load(model_path)\n", - "\n", - "prediction = model.predict(df_test1)\n", - "prediction\n", - "\n", - "df_result = df_test \n", - "df_result['prediction'] = prediction\n", - "df_result\n", - "\n", - "data_lake_account_name = 'purviewaccdl'\n", - "file_system_name = 'purviewaccfs'\n", - "df_sp = spark.createDataFrame(df_result)\n", - "df_sp.write.option('header', 'true').mode('overwrite').csv(synapse_base_path + '/creditriskdata/batchpredictions/')\n", - "\n", - "df_sp.write.mode(\"overwrite\").saveAsTable(\"default.creditrisk_predictions\")" - ] - }, - { - "cell_type": "code", - "execution_count": 22, - "metadata": { - "collapsed": true, - "jupyter": { - "outputs_hidden": false, - "source_hidden": false + ], + "metadata": { + "kernelspec": { + "name": "synapse_pyspark", + "display_name": "python" }, - "nteract": { - "transient": { - "deleting": false - } + "language_info": { + "name": "python" + }, + "save_output": true, + "synapse_widget": { + "version": "0.1", + "state": {} } - }, - "outputs": [], - "source": [ - "#create an entity for test data\n", - "test_data_ent_name = 'creditrisktestdata'\n", - "create_data_entity_with_schema(df_test,test_data_ent_name,entitytype='custom_dataset')\n", - "\n", - "#create an entity for batch inference data\n", - "batchpred_data_ent_name = 'creditriskbatchpredictions'\n", - "create_data_entity_with_schema(df_result,batchpred_data_ent_name,entitytype='custom_dataset')\n", - "\n", - "#create batch inference lineage \n", - 
"syn_basepath = 'https://' + data_lake_account_name + 'dfs.core.windows.net' + file_system_name + '/'\n", - "pbasepath = 'pyapacheatlas://'\n", - "\n", - "in_ent_qns = {pbasepath + test_data_ent_name:'custom_dataset',pbasepath + model_name:'custom_ml_model'}\n", - "out_ent_qns = {pbasepath + batchpred_data_ent_name:'custom_dataset'}\n", - "\n", - "processname = '-batchinference'\n", - "create_lineage_for_entities(experimentname,processname, in_ent_qns,out_ent_qns,ColumnMapping=True)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "## uncomment below code to link PowerBI Dataset and Report in lineage if you have access to a PBI workspace " - ] - }, - { - "cell_type": "code", - "execution_count": 23, - "metadata": {}, - "outputs": [], - "source": [ - "# #The PowerBI entities will populate with more details if you set up a scan for PBI workspaces in Purview\n", - "# #We are only creating placeholders and links for lineage below\n", - "\n", - "# #create PowerBI dataset entity and lineage \n", - "# pbi_workspace = '' #'https://xxx.powerbi.com/groups/7c555287-f9b8-45ff-be6c-9909afe9df40'\n", - "# pbi_datasetid = '' #'c4a30c22-466d-4a30-a1ac-8736ed6567cc' \n", - "\n", - "# pbidata_ent_name = 'creditriskpbidataset' \n", - "\n", - "# create_powerbi_dataset_and_lineage(experimentname,pbi_workspace,pbi_datasetid,pbidata_ent_name,batchpred_data_ent_name,'custom_dataset')" - ] - }, - { - "cell_type": "code", - "execution_count": 24, - "metadata": {}, - "outputs": [], - "source": [ - "# #create PowerBI report entity and lineage\n", - "# pbi_reportid = '' #'e495453d-6c0c-4fb9-bdc4-556319f6a57b'\n", - "# pbi_ent_name = 'creditriskpbireport'\n", - " \n", - "# create_powerbi_report_and_lineage(experimentname,pbi_workspace,pbi_reportid,pbi_ent_name,pbi_datasetid)" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.7.4" }, - "save_output": true - }, - "nbformat": 4, - "nbformat_minor": 2 -} + "nbformat": 4, + "nbformat_minor": 2 +} \ No newline at end of file