Azure Database for PostgreSQL and Python can be used together for data analysis – PostgreSQL as database engine and Python as statistical tool. When dealing with large datasets that potentially exceed the memory of your machine it is recommended to push the data into database engine, where you can query the data in smaller digestible chunks.
In this article we will learn how to use Python to perform the following tasks:
- Create Azure Database for PostgreSQL using azure python sdk
- Connect to Azure Database for PostgreSQL using psycopg2
- Create databases and tables
- Load data from pandas dataframe into a table
- Query data from table
- Visualize data from table using plotnine
- Delete Azure Database for PostgreSQL server using azure python sdk
# pip install azure plotnine passgen psycopg2 adal
from azure.mgmt.resource import ResourceManagementClient
from msrestazure.azure_active_directory import AADTokenCredentials
from azure.mgmt.resource.resources.models import DeploymentMode
import adal, json, requests, psycopg2, passgen, warnings
from plotnine import ggplot, geom_point, aes, stat_smooth, facet_wrap
from plotnine.data import mtcars
from io import StringIO
import pandas.io.sql as sqlio
def authenticate_device_code():
# Replace values with your client and tenant id
tenant = '00000000-0000-0000-0000-000000000000'
client_id = '00000000-0000-0000-0000-000000000000'
authority_host_uri = 'https://login.microsoftonline.com'
authority_uri = authority_host_uri + '/' + tenant
resource_uri = 'https://management.core.windows.net/'
context = adal.AuthenticationContext(authority_uri, api_version=None)
code = context.acquire_user_code(resource_uri, client_id)
print(code['message'])
mgmt_token = context.acquire_token_with_device_code(resource_uri, code, client_id)
credentials = AADTokenCredentials(mgmt_token, client_id)
return credentials
if __name__ == '__main__':
# Initialize parameters
# Replace with your subscription id
subscription_id = "00000000-0000-0000-0000-000000000000"
resource_group = "test_group"
location = "southcentralus"
pg_username = "azureuser"
pg_password = passgen.passgen(length=12, punctuation=False, digits=True, letters=True, case='both')
pg_servername = "testserver"
# Other ways to obtain credentials : https://github.com/Azure-Samples/data-lake-analytics-python-auth-options/blob/master/sample.py
credentials = authenticate_device_code()
client = ResourceManagementClient(credentials, subscription_id)
# Create Resource group
print("\nCreating Resource Group","\n")
client.resource_groups.create_or_update(resource_group, {"location": location})
# Create PostgreSQL Server using ARM template
print("Creating Azure Database for PostgreSQL Server","\n")
template = json.loads(requests.get("https://raw.githubusercontent.com/Azure/azure-postgresql/master/arm-templates/ExampleWithFirewallRule/template.json").text)
parameters = {
'administratorLogin': pg_username,
'administratorLoginPassword': pg_password,
'location': location,
'serverName': pg_servername,
'skuCapacity': 2,
'skuFamily': 'Gen5',
'skuName': 'GP_Gen5_2',
'skuSizeMB': 51200,
'skuTier': 'GeneralPurpose',
'version': '10',
'backupRetentionDays': 7,
'geoRedundantBackup': 'Disabled'
}
parameters = {k: {'value': v} for k, v in parameters.items()}
deployment_properties = {
'mode': DeploymentMode.incremental,
'template': template,
'parameters': parameters
}
deployment_async_operation = client.deployments.create_or_update(
resource_group,
'azure-pg-sample',
deployment_properties
)
deployment_async_operation.wait()
# Connect to postgres database using psycopg2
print("Connecting to Azure Database for PostgreSQL Server","\n")
try:
connection = psycopg2.connect(user = pg_username + '@' + pg_servername,
password = pg_password,
host = pg_servername + ".postgres.database.azure.com",
port = "5432",
database = "postgres")
connection.autocommit = True
cursor = connection.cursor()
# Print PostgreSQL version
cursor.execute("SELECT version();")
record = cursor.fetchone()
print("You are connected to - ", record,"\n")
# create database mtcars and connect to it
print("Creating database mtcars","\n")
cursor.execute("CREATE DATABASE mtcars")
connection = psycopg2.connect(user = pg_username + '@' + pg_servername,
password = pg_password,
host = pg_servername + ".postgres.database.azure.com",
port = "5432",
database = "mtcars")
connection.autocommit = True
cursor = connection.cursor()
# Create mtcars table
print("Creating table mtcars","\n")
create_table_query = '''CREATE TABLE mtcars (
name VARCHAR(50) NOT NULL,
mpg FLOAT NOT NULL,
cyl INTEGER NOT NULL,
disp FLOAT NOT NULL,
hp INTEGER NOT NULL,
drat FLOAT NOT NULL,
wt FLOAT NOT NULL,
qsec FLOAT NOT NULL,
vs INTEGER NOT NULL,
am INTEGER NOT NULL,
gear INTEGER NOT NULL,
carb INTEGER NOT NULL
);'''
cursor.execute(create_table_query)
# Load mtcars data from pandas dataframe
print("Loading data into table mtcars","\n")
sio = StringIO()
sio.write(mtcars.to_csv(index=None, header=None)) # Write the Pandas DataFrame as a csv to the buffer
sio.seek(0) # Be sure to reset the position to the start of the stream
cursor.copy_from(sio, "mtcars", columns=mtcars.columns, sep=',')
# read mtcars data from Postgresql into a pandas dataframe
print("Reading data from mtcars table into a pandas dataframe","\n")
mtcars_data = sqlio.read_sql_query("select * from mtcars", connection)
# visualize the data using ggplot
print("Visualizing data from mtcars table","\n")
plot = (ggplot(mtcars_data, aes('wt', 'mpg', color='factor(gear)'))
+ geom_point()
+ stat_smooth(method='lm')
+ facet_wrap('~gear'))
# We run this to suppress various deprecation warnings from plotnine - keeps our output cleaner
warnings.filterwarnings('ignore')
# Save the plot as pdf file
print("Saving plot as PDF file","\n")
plot.save("mtcars.pdf")
except (Exception, psycopg2.Error) as error :
print ("Error while connecting to PostgreSQL", error)
finally:
#closing database connection.
if(connection):
cursor.close()
connection.close()
print("PostgreSQL connection is closed","\n")
# Delete Resource group and everything in it
print("Deleting Resource Group", "\n")
delete_async_operation = client.resource_groups.delete(resource_group)
delete_async_operation.wait()
print("Deleted: {}\n".format(resource_group))
Updated Sep 06, 2019
Version 1.0ramkychan
Microsoft
Joined April 05, 2019
Azure Database for PostgreSQL Blog
Follow this blog board to get notified when there's new activity