Python + Azure Database for PostgreSQL

Azure Database for PostgreSQL and Python can be used together for data analysis – PostgreSQL as the database engine and Python as the statistical tool. When dealing with large datasets that potentially exceed the memory of your machine, it is recommended to push the data into the database engine, where you can query it in smaller, digestible chunks.
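
For example, once the data lives in PostgreSQL, pandas can pull query results back in fixed-size chunks rather than loading the entire table at once. The snippet below is a minimal sketch of that pattern; the server name, credentials and big_table are placeholders and not part of the sample later in this post.

import pandas as pd
import psycopg2

# Hypothetical connection details - replace with your own server and credentials
conn = psycopg2.connect(host="myserver.postgres.database.azure.com",
                        user="azureuser@myserver",
                        password="<password>",
                        dbname="postgres",
                        port=5432)

# With chunksize set, read_sql_query returns an iterator of DataFrames,
# so only one chunk is held in memory at a time
for chunk in pd.read_sql_query("SELECT * FROM big_table", conn, chunksize=10000):
    print(chunk.shape)  # stand-in for your per-chunk analysis

conn.close()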

 


In this article we will learn how to use Python to perform the following tasks:

 

  • Create an Azure Database for PostgreSQL server using the Azure SDK for Python
  • Connect to Azure Database for PostgreSQL using psycopg2
  • Create databases and tables
  • Load data from a pandas DataFrame into a table
  • Query data from a table
  • Visualize data from a table using plotnine
  • Delete the Azure Database for PostgreSQL server using the Azure SDK for Python


# pip install azure plotnine passgen psycopg2 adal
from azure.mgmt.resource import ResourceManagementClient
from msrestazure.azure_active_directory import AADTokenCredentials
from azure.mgmt.resource.resources.models import DeploymentMode
import adal, json, requests, psycopg2, passgen, warnings
from plotnine import ggplot, geom_point, aes, stat_smooth, facet_wrap
from plotnine.data import mtcars
from io import StringIO
import pandas.io.sql as sqlio

def authenticate_device_code():
    # Replace values with your client and tenant id
    tenant = '00000000-0000-0000-0000-000000000000'
    client_id = '00000000-0000-0000-0000-000000000000'
    authority_host_uri = 'https://login.microsoftonline.com'
    authority_uri = authority_host_uri + '/' + tenant
    resource_uri = 'https://management.core.windows.net/'
    
    context = adal.AuthenticationContext(authority_uri, api_version=None)
    code = context.acquire_user_code(resource_uri, client_id)
    print(code['message'])
    mgmt_token = context.acquire_token_with_device_code(resource_uri, code, client_id)
    credentials = AADTokenCredentials(mgmt_token, client_id)
    return credentials

if __name__ == '__main__':
    
    # Initialize parameters
    # Replace with your subscription id
    subscription_id = "00000000-0000-0000-0000-000000000000"
    resource_group = "test_group"
    location = "southcentralus"
    pg_username = "azureuser"
    pg_password = passgen.passgen(length=12, punctuation=False, digits=True, letters=True, case='both')
    pg_servername = "testserver"

    # Other ways to obtain credentials : https://github.com/Azure-Samples/data-lake-analytics-python-auth-options/blob/master/sample.py
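    # Alternative (sketch): non-interactive sign-in with a service principal,
    # assuming you have already created one and have its client secret:
    #   from azure.common.credentials import ServicePrincipalCredentials
    #   credentials = ServicePrincipalCredentials(client_id='<app id>',
    #                                             secret='<client secret>',
    #                                             tenant='<tenant id>')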
    credentials = authenticate_device_code()
    client = ResourceManagementClient(credentials, subscription_id)

    # Create Resource group
    print("\nCreating Resource Group","\n")
    client.resource_groups.create_or_update(resource_group, {"location": location})

    # Create PostgreSQL Server using ARM template
    print("Creating Azure Database for PostgreSQL Server","\n")
    template = json.loads(requests.get("https://raw.githubusercontent.com/Azure/azure-postgresql/master/arm-templates/ExampleWithFirewallRul...").text)
    parameters = {
        'administratorLogin': pg_username,
        'administratorLoginPassword': pg_password,
        'location': location,
        'serverName': pg_servername,
        'skuCapacity': 2,
        'skuFamily': 'Gen5',
        'skuName': 'GP_Gen5_2',
        'skuSizeMB': 51200,
        'skuTier': 'GeneralPurpose',
        'version': '10',
        'backupRetentionDays': 7,
        'geoRedundantBackup': 'Disabled'
    }
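    # ARM deployments expect each parameter value to be wrapped as {'value': ...}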
    parameters = {k: {'value': v} for k, v in parameters.items()}
    deployment_properties = {
        'mode': DeploymentMode.incremental,
        'template': template,
        'parameters': parameters
    }
    deployment_async_operation = client.deployments.create_or_update(
        resource_group,
        'azure-pg-sample',
        deployment_properties
    )
    deployment_async_operation.wait()
    
    # Connect to postgres database using psycopg2
    print("Connecting to Azure Database for PostgreSQL Server","\n")
    connection = None
    try:
        connection = psycopg2.connect(user = pg_username + '@' + pg_servername,
                                    password = pg_password,
                                    host = pg_servername + ".postgres.database.azure.com",
                                    port = "5432",
                                    database = "postgres")
        connection.autocommit = True
        cursor = connection.cursor()
        
        # Print PostgreSQL version
        cursor.execute("SELECT version();")
        record = cursor.fetchone()
        print("You are connected to - ", record,"\n")

        # create database mtcars and connect to it
        print("Creating database mtcars","\n")
        cursor.execute("CREATE DATABASE mtcars")
        connection = psycopg2.connect(user = pg_username + '@' + pg_servername,
                                    password = pg_password,
                                    host = pg_servername + ".postgres.database.azure.com",
                                    port = "5432",
                                    database = "mtcars")
        connection.autocommit = True
        cursor = connection.cursor()

        # Create mtcars table
        print("Creating table mtcars","\n")
        create_table_query = '''CREATE TABLE mtcars (
            name VARCHAR(50) NOT NULL, 
            mpg FLOAT NOT NULL, 
            cyl INTEGER NOT NULL, 
            disp FLOAT NOT NULL, 
            hp INTEGER NOT NULL, 
            drat FLOAT NOT NULL, 
            wt FLOAT NOT NULL, 
            qsec FLOAT NOT NULL, 
            vs INTEGER NOT NULL, 
            am INTEGER NOT NULL, 
            gear INTEGER NOT NULL, 
            carb INTEGER NOT NULL
            );'''
        cursor.execute(create_table_query)

        # Load mtcars data from pandas dataframe
        print("Loading data into table mtcars","\n")
        sio = StringIO()
        sio.write(mtcars.to_csv(index=None, header=None))  # Write the Pandas DataFrame as a csv to the buffer
        sio.seek(0)  # Be sure to reset the position to the start of the stream
        cursor.copy_from(sio, "mtcars", columns=mtcars.columns, sep=',')
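        # Alternative (sketch): psycopg2.extras.execute_values can bulk-insert
        # the rows without going through COPY, e.g.:
        #   from psycopg2.extras import execute_values
        #   execute_values(cursor, "INSERT INTO mtcars VALUES %s",
        #                  list(mtcars.itertuples(index=False, name=None)))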

        # read mtcars data from Postgresql into a pandas dataframe
        print("Reading data from mtcars table into a pandas dataframe","\n")
        mtcars_data = sqlio.read_sql_query("select * from mtcars", connection)

        # visualize the data using ggplot
        print("Visualizing data from mtcars table","\n")
        plot = (ggplot(mtcars_data, aes('wt', 'mpg', color='factor(gear)'))
        + geom_point()
        + stat_smooth(method='lm')
        + facet_wrap('~gear'))

        # We run this to suppress various deprecation warnings from plotnine - keeps our output cleaner
        warnings.filterwarnings('ignore')

        # Save the plot as pdf file
        print("Saving plot as PDF file","\n")
        plot.save("mtcars.pdf")

    except (Exception, psycopg2.Error) as error:
        print("Error while connecting to PostgreSQL", error)
    finally:
        # Closing database connection
        if connection:
            cursor.close()
            connection.close()
            print("PostgreSQL connection is closed","\n")

    # Delete Resource group and everything in it
    print("Deleting Resource Group", "\n")
    delete_async_operation = client.resource_groups.delete(resource_group)
    delete_async_operation.wait()
    print("Deleted: {}\n".format(resource_group))
