Python + Azure Database for PostgreSQL

Azure Database for PostgreSQL and Python can be used together for data analysis – PostgreSQL as the database engine and Python as the statistical tool. When dealing with large datasets that potentially exceed the memory of your machine, it is recommended to push the data into the database engine, where you can query it in smaller, digestible chunks.
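
For example, once the data lives in PostgreSQL, pandas can pull query results back in fixed-size chunks rather than loading the entire table at once. The snippet below is a minimal sketch of that pattern; the server name, credentials and big_table are placeholders and not part of the sample later in this post.

import pandas as pd
import psycopg2

# Hypothetical connection details - replace with your own server and credentials
conn = psycopg2.connect(host="myserver.postgres.database.azure.com",
                        user="azureuser@myserver",
                        password="<password>",
                        dbname="postgres",
                        port=5432)

# With chunksize set, read_sql_query returns an iterator of DataFrames,
# so only one chunk is held in memory at a time
for chunk in pd.read_sql_query("SELECT * FROM big_table", conn, chunksize=10000):
    print(chunk.shape)  # stand-in for your per-chunk analysis

conn.close()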

 


In this article we will learn how to use Python to perform the following tasks:

 

  • Create an Azure Database for PostgreSQL server using the Azure SDK for Python
  • Connect to Azure Database for PostgreSQL using psycopg2
  • Create databases and tables
  • Load data from a pandas DataFrame into a table
  • Query data from a table
  • Visualize data from a table using plotnine
  • Delete the Azure Database for PostgreSQL server using the Azure SDK for Python


# pip install azure plotnine passgen psycopg2 adal
from azure.mgmt.resource import ResourceManagementClient
from msrestazure.azure_active_directory import AADTokenCredentials
from azure.mgmt.resource.resources.models import DeploymentMode
import adal, json, requests, psycopg2, passgen, warnings
from plotnine import ggplot, geom_point, aes, stat_smooth, facet_wrap
from plotnine.data import mtcars
from io import StringIO
import pandas.io.sql as sqlio

def authenticate_device_code():
    # Replace values with your client and tenant id
    tenant = '00000000-0000-0000-0000-000000000000'
    client_id = '00000000-0000-0000-0000-000000000000'
    authority_host_uri = 'https://login.microsoftonline.com'
    authority_uri = authority_host_uri + '/' + tenant
    resource_uri = 'https://management.core.windows.net/'
    
    context = adal.AuthenticationContext(authority_uri, api_version=None)
    code = context.acquire_user_code(resource_uri, client_id)
    print(code['message'])
    mgmt_token = context.acquire_token_with_device_code(resource_uri, code, client_id)
    credentials = AADTokenCredentials(mgmt_token, client_id)
    return credentials

if __name__ == '__main__':
    
    # Initialize parameters
    # Replace with your subscription id
    subscription_id = "00000000-0000-0000-0000-000000000000"
    resource_group = "test_group"
    location = "southcentralus"
    pg_username = "azureuser"
    pg_password = passgen.passgen(length=12, punctuation=False, digits=True, letters=True, case='both')
    pg_servername = "testserver"

    # Other ways to obtain credentials : https://github.com/Azure-Samples/data-lake-analytics-python-auth-options/blob/master/sample.py
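    # Alternative (sketch): non-interactive sign-in with a service principal,
    # assuming you have already created one and have its client secret:
    #   from azure.common.credentials import ServicePrincipalCredentials
    #   credentials = ServicePrincipalCredentials(client_id='<app id>',
    #                                             secret='<client secret>',
    #                                             tenant='<tenant id>')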
    credentials = authenticate_device_code()
    client = ResourceManagementClient(credentials, subscription_id)

    # Create Resource group
    print("\nCreating Resource Group","\n")
    client.resource_groups.create_or_update(resource_group, {"location": location})

    # Create PostgreSQL Server using ARM template
    print("Creating Azure Database for PostgreSQL Server","\n")
    template = json.loads(requests.get("https://raw.githubusercontent.com/Azure/azure-postgresql/master/arm-templates/ExampleWithFirewallRul...").text)
    parameters = {
        'administratorLogin': pg_username,
        'administratorLoginPassword': pg_password,
        'location': location,
        'serverName': pg_servername,
        'skuCapacity': 2,
        'skuFamily': 'Gen5',
        'skuName': 'GP_Gen5_2',
        'skuSizeMB': 51200,
        'skuTier': 'GeneralPurpose',
        'version': '10',
        'backupRetentionDays': 7,
        'geoRedundantBackup': 'Disabled'
    }
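    # ARM deployments expect each parameter value to be wrapped as {'value': ...}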
    parameters = {k: {'value': v} for k, v in parameters.items()}
    deployment_properties = {
        'mode': DeploymentMode.incremental,
        'template': template,
        'parameters': parameters
    }
    deployment_async_operation = client.deployments.create_or_update(
        resource_group,
        'azure-pg-sample',
        deployment_properties
    )
    deployment_async_operation.wait()
    
    # Connect to postgres database using psycopg2
    print("Connecting to Azure Database for PostgreSQL Server","\n")
    connection = None
    try:
        connection = psycopg2.connect(user = pg_username + '@' + pg_servername,
                                    password = pg_password,
                                    host = pg_servername + ".postgres.database.azure.com",
                                    port = "5432",
                                    database = "postgres")
        connection.autocommit = True
        cursor = connection.cursor()
        
        # Print PostgreSQL version
        cursor.execute("SELECT version();")
        record = cursor.fetchone()
        print("You are connected to - ", record,"\n")

        # create database mtcars and connect to it
        print("Creating database mtcars","\n")
        cursor.execute("CREATE DATABASE mtcars")
        connection = psycopg2.connect(user = pg_username + '@' + pg_servername,
                                    password = pg_password,
                                    host = pg_servername + ".postgres.database.azure.com",
                                    port = "5432",
                                    database = "mtcars")
        connection.autocommit = True
        cursor = connection.cursor()

        # Create mtcars table
        print("Creating table mtcars","\n")
        create_table_query = '''CREATE TABLE mtcars (
            name VARCHAR(50) NOT NULL, 
            mpg FLOAT NOT NULL, 
            cyl INTEGER NOT NULL, 
            disp FLOAT NOT NULL, 
            hp INTEGER NOT NULL, 
            drat FLOAT NOT NULL, 
            wt FLOAT NOT NULL, 
            qsec FLOAT NOT NULL, 
            vs INTEGER NOT NULL, 
            am INTEGER NOT NULL, 
            gear INTEGER NOT NULL, 
            carb INTEGER NOT NULL
            );'''
        cursor.execute(create_table_query)

        # Load mtcars data from pandas dataframe
        print("Loading data into table mtcars","\n")
        sio = StringIO()
        sio.write(mtcars.to_csv(index=None, header=None))  # Write the Pandas DataFrame as a csv to the buffer
        sio.seek(0)  # Be sure to reset the position to the start of the stream
        cursor.copy_from(sio, "mtcars", columns=mtcars.columns, sep=',')
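        # Alternative (sketch): psycopg2.extras.execute_values can bulk-insert
        # the rows without going through COPY, e.g.:
        #   from psycopg2.extras import execute_values
        #   execute_values(cursor, "INSERT INTO mtcars VALUES %s",
        #                  list(mtcars.itertuples(index=False, name=None)))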

        # read mtcars data from Postgresql into a pandas dataframe
        print("Reading data from mtcars table into a pandas dataframe","\n")
        mtcars_data = sqlio.read_sql_query("select * from mtcars", connection)

        # visualize the data using ggplot
        print("Visualizing data from mtcars table","\n")
        plot = (ggplot(mtcars_data, aes('wt', 'mpg', color='factor(gear)'))
        + geom_point()
        + stat_smooth(method='lm')
        + facet_wrap('~gear'))

        # We run this to suppress various deprecation warnings from plotnine - keeps our output cleaner
        warnings.filterwarnings('ignore')

        # Save the plot as pdf file
        print("Saving plot as PDF file","\n")
        plot.save("mtcars.pdf")

    except (Exception, psycopg2.Error) as error:
        print("Error while connecting to PostgreSQL", error)
    finally:
        # Closing database connection
        if connection:
            cursor.close()
            connection.close()
            print("PostgreSQL connection is closed","\n")

    # Delete Resource group and everything in it
    print("Deleting Resource Group", "\n")
    delete_async_operation = client.resource_groups.delete(resource_group)
    delete_async_operation.wait()
    print("Deleted: {}\n".format(resource_group))
