From f57792b320207fb0c7a485ef9d193fa0655e81fa Mon Sep 17 00:00:00 2001 From: Christopher Glenn Date: Sun, 17 Apr 2022 00:45:30 -0400 Subject: [PATCH 1/8] Genesis --- .gitignore | 2 + README.md | 7 +- setup.py | 14 ++ simple_graph_etl/__init__.py | 0 simple_graph_etl/documentlibrary.py | 44 +++++++ simple_graph_etl/simpleetl.py | 190 ++++++++++++++++++++++++++++ tests/__init__.py | 0 tests/tests.py | 0 8 files changed, 256 insertions(+), 1 deletion(-) create mode 100644 .gitignore create mode 100644 setup.py create mode 100644 simple_graph_etl/__init__.py create mode 100644 simple_graph_etl/documentlibrary.py create mode 100644 simple_graph_etl/simpleetl.py create mode 100644 tests/__init__.py create mode 100644 tests/tests.py diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..50a19c6 --- /dev/null +++ b/.gitignore @@ -0,0 +1,2 @@ +.env +venv \ No newline at end of file diff --git a/README.md b/README.md index ab00ea4..32c8721 100644 --- a/README.md +++ b/README.md @@ -1 +1,6 @@ -# simple-graph +# simple-graph-etl +"Minimal wrapper lib for Python ETLs using Microsoft's Graph API" + +## TODO +Add readme :) +Add tests \ No newline at end of file diff --git a/setup.py b/setup.py new file mode 100644 index 0000000..be19648 --- /dev/null +++ b/setup.py @@ -0,0 +1,14 @@ +from setuptools import find_packages, setup + +setup( + name='simple_graph_etl', + packages=find_packages(include=['simple_graph_etl']), + version='1.0.0', + description="Minimal wrapper lib for Python ETLs using Microsoft's Graph API", + author='glennpai / chglenn20@gmail.com', + license='MIT', + install_requires=['msal', 'requests'], + setup_requires=['pytest-runner'], + tests_require=['pytest==4.4.1'], + test_suite='tests', +) diff --git a/simple_graph_etl/__init__.py b/simple_graph_etl/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/simple_graph_etl/documentlibrary.py b/simple_graph_etl/documentlibrary.py new file mode 100644 index 0000000..3da9b33 --- 
/dev/null +++ b/simple_graph_etl/documentlibrary.py @@ -0,0 +1,44 @@ +""" +Module to unify and simplify configuration of a SharePoint document library for use in +a Python ETL +""" +class DocumentLibrary: + """ + A class containing configuration for accessing a SharePoint document library + via the Graph API + + Attributes: + client_id (string): Client ID for Azure Active Directory subscription + site_id (string): Site ID for a library's parent SharePoint site + res_id (string): Resource ID of a SharePoint document library + authority (string): Authority string for an Azure app registration + scope (string): Permission scopes of the user authenticating to the Azure app registration + base_url (string): Base URL of the document library formed from the Site and Res IDs + """ + # Pylint(R0913:too-many-arguments) + # Ignoring in interest of keeping config flat and consistent + def __init__(self, client_id, site_id, res_id, authority, scope): + self.client_id = client_id + self.site_id = site_id + self.res_id = res_id + self.authority = authority + self.scope = scope + self.base_url = self.get_base_url() + + + def __repr__(self): + return f'DocumentLibrary({self.client_id},{self.site_id},{self.res_id}, \ + {self.authority},{self.scope},{self.base_url}' + + + def get_base_url(self): + """ + Returns base URL used in most ETL functions via the Graph API + + Parameters: + + Returns: + URL string constructed from site and res IDs + """ + return f'https://graph.microsoft.com/v1.0/sites/ \ + {self.site_id}/drives/{self.res_id}' diff --git a/simple_graph_etl/simpleetl.py b/simple_graph_etl/simpleetl.py new file mode 100644 index 0000000..b733d1d --- /dev/null +++ b/simple_graph_etl/simpleetl.py @@ -0,0 +1,190 @@ +""" +Module to simplify basic Python ETL interactions with a SharePoint document library +""" +from os import path +import msal +import requests + +class SimpleETL: + """ + A class to simplify ETL functions performed on Azure app registrations and SharePoint + document 
libraries via the Graph API + + Class constructor accepts a DocumentLibrary instance and the required authentication + configuration + + Attributes: + library (DocumentLibrary): SharePoint document library configuration + __thumbprint [private] (string): Hash of signed certificate used when authenticating to the + Azure app registration + __private_key [private] (string): Private key used to authenticate to the Azure app + registration + __token [private] (string): Authentication token acquired from Azure app registration + """ + def __init__(self, document_library, thumbprint, private_key): + self.library = document_library + self.__thumbprint = thumbprint + self.__private_key = private_key + self.__token = self.__acquire_token() + + + @staticmethod + def __get_item_id(file_items, target_name): + """ + Gets item ID value from a file object if its name matches the target name + + Parameters: + file_items (any[]): List of file objects to check + target_name (string): Name to search for in list of file objects + Returns: + item_id (string): URL string constructed from site and res IDs + """ + item_id = '' + + for item in file_items: + if item['name'] == target_name: + item_id = item['id'] + + return item_id + + + def __acquire_token(self): + """ + Authenticates against Azure app registration to get an auth token used for + calls to the Graph API + + Parameters: + + Returns: + result['access_token'] (string): String value of auth token + """ + app = msal.ConfidentialClientApplication( + self.library.client_id, + authority=self.library.authority, + client_credential={'thumbprint': self.__thumbprint, 'private_key': self.__private_key}, + ) + + result = None + result = app.acquire_token_silent([self.library.scope], account=None) + + if not result: + result = app.acquire_token_for_client(scopes=[self.library.scope]) + + if 'access_token' in result: + return result['access_token'] + + raise Exception(result.get('error')) + + + def fetch(self, source_path): + """ + Gets a 
list of files that are children to the source_path directory + + Parameters: + source_path (string): Path to parent directory containing target files + Returns: + file_list (any[]): List of file objects retrieved from source_path + """ + file_list = [] + + file_list_resp = requests.get(f'{self.library.base_url}/root:/{source_path}:/children', + headers={'Authorization': 'Bearer ' + self.__token}) + + if file_list_resp.status_code == 200: + objs = file_list_resp.json()['value'] + + for obj in objs: + if '@microsoft.graph.downloadUrl' not in obj: + continue + + file_data = requests.get(obj['@microsoft.graph.downloadUrl']) + + if file_data.status_code == 200: + try: + file = open(obj['name'], 'wb') + file.write(file_data.content) + file_list.append(obj['name']) + except Exception as err: + raise f'Failed to write file data. {err}' + else: + raise Exception(f'Bad response fetching file "{obj["name"]}".' + + f'{file_data.raise_for_status()}') + + else: + raise Exception('Bad response from the remote host.' + + f'{file_list_resp.raise_for_status()}') + + return file_list + + + def delete(self, dest_path, file_name): + """ + Deletes a remote file from a SharePoint document library based on file path + and name + + Parameters: + dest_path (string): Remote path of parent directory of file to delete + file_name (string): Name of remote file to delete + Returns: + """ + list_url = f'{self.library.base_url}/root:/{dest_path}:/children' + + file_list_response = requests.get(list_url, + headers={'Authorization': 'Bearer ' + self.__token}) + + if file_list_response.status_code == 200: + item_id = self.__get_item_id(file_list_response.json()['value'], file_name) + + if item_id != '': + delete_url = f'{self.library.base_url}/items/' + + delete_response = requests.delete(delete_url + item_id, + headers={'Authorization': 'Bearer ' + self.__token}) + + if delete_response.status_code != 204: + raise Exception(f'Failed to delete {file_name}. 
\ + {delete_response.raise_for_status()}') + + else: + raise Exception(f'Failed to fetch item info for {file_name}') + + else: + raise Exception(f'Failed to fetch file list from {dest_path}. \ + {file_list_response.raise_for_status()}') + + + def upload(self, dest_path, file_name): + """ + Uploads a local file to a SharePoint document library + + Parameters: + dest_path (string): Remote path of parent directory of file to upload + file_name (string): Name of local file to upload + Returns: + """ + upload_session = requests.post(f'{self.library.base_url}/root:/ \ + {dest_path}/{file_name}:/createUploadSession', + headers={'Authorization': 'Bearer ' + self.__token}) + + if upload_session.status_code == 200: + upload_url = upload_session.json()['uploadUrl'] + + try: + file = open(f'{file_name}', 'rb') + file_size = path.getsize(file_name) + + # Content length and content range are required headers. + # File data (bytes) is sent in body. + upload_response = requests.put(upload_url, + headers={'Content-Length': f'{file_size}', + 'Content-Range': f'bytes 0-{file_size - 1}/{file_size}'}, + data=file) + + if upload_response.status_code != 201: + raise Exception(upload_response.raise_for_status()) + + except Exception as err: + raise f'Failed to upload file to upload URL. {err}' + + else: + raise Exception(f'Error retrieving upload URL. 
{upload_session.raise_for_status()}') diff --git a/tests/__init__.py b/tests/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/tests.py b/tests/tests.py new file mode 100644 index 0000000..e69de29 From be973b5b0cab66f4362ee1aa385ba0073f0a11e1 Mon Sep 17 00:00:00 2001 From: Christopher Glenn <60518533+glennpai@users.noreply.github.com> Date: Sun, 17 Apr 2022 00:46:35 -0400 Subject: [PATCH 2/8] Update README.md --- README.md | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index 32c8721..0af982c 100644 --- a/README.md +++ b/README.md @@ -1,6 +1,8 @@ # simple-graph-etl -"Minimal wrapper lib for Python ETLs using Microsoft's Graph API" + +Minimal wrapper lib for Python ETLs using Microsoft's Graph API ## TODO -Add readme :) -Add tests \ No newline at end of file + +Add readme :) +Add tests From e97ad507e2639c9cd5f6979be0049195e772caf6 Mon Sep 17 00:00:00 2001 From: Christopher Glenn <60518533+glennpai@users.noreply.github.com> Date: Mon, 18 Apr 2022 12:44:45 -0400 Subject: [PATCH 3/8] Update README.md --- README.md | 42 +++++++++++++++++++++++++++++++++++++++++- 1 file changed, 41 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 0af982c..131f10e 100644 --- a/README.md +++ b/README.md @@ -2,7 +2,47 @@ Minimal wrapper lib for Python ETLs using Microsoft's Graph API +Designed with intent for use in Ohio University Python scripts interacting with the Graph API + +## Example +Example document library structure: +``` +remote +└──dir + └──path + └──ExampleFile.txt +``` + +Example ETL: +```Python +import simple_graph_etl as sge + +documentLibrary = sge.documentlibrary( + client_id = 'some client ID', + site_id = 'some site ID', + res_id = 'some res ID', + authority = 'some authority', + scope = 'some scope' + ) + +connection = sge.simpleetl( + library = documentLibrary, + thumbprint = 'some thumbprint', + private_key = 'some private key' +) + +files = 
connection.fetch('/remote/dir/path') // Create local copies of child files at specified remote path + +transform_file('ExampleFile.txt') // Transform local file + +connection.delete('/remote/dir/path', 'ExampleFile.txt') // Delete remote copy of file as it will be replaced + +connection.upload('/remote/dir/path', 'ExampleFile.txt') // Upload local copy of file to same location as original + +``` + ## TODO -Add readme :) Add tests +Peer review +Publish From 0800cd4e3143c69182c837972a6de486a24e926b Mon Sep 17 00:00:00 2001 From: Christopher Glenn <60518533+glennpai@users.noreply.github.com> Date: Mon, 18 Apr 2022 12:50:54 -0400 Subject: [PATCH 4/8] Update README.md --- README.md | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index 131f10e..00f22e0 100644 --- a/README.md +++ b/README.md @@ -31,13 +31,13 @@ connection = sge.simpleetl( private_key = 'some private key' ) -files = connection.fetch('/remote/dir/path') // Create local copies of child files at specified remote path +files = connection.fetch('/remote/dir/path') # Create local copies of child files at specified remote path -transform_file('ExampleFile.txt') // Transform local file +transform_file('ExampleFile.txt') # Transform local file -connection.delete('/remote/dir/path', 'ExampleFile.txt') // Delete remote copy of file as it will be replaced +connection.delete('/remote/dir/path', 'ExampleFile.txt') # Delete remote copy of file as it will be replaced -connection.upload('/remote/dir/path', 'ExampleFile.txt') // Upload local copy of file to same location as original +connection.upload('/remote/dir/path', 'ExampleFile.txt') # Upload local copy of file to same location as original ``` From 23566bbb1cf2e1e94eec85835a446e1990746da3 Mon Sep 17 00:00:00 2001 From: Christopher Glenn <60518533+glennpai@users.noreply.github.com> Date: Mon, 18 Apr 2022 12:57:46 -0400 Subject: [PATCH 5/8] Update README.md --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) 
diff --git a/README.md b/README.md index 00f22e0..0a3ded8 100644 --- a/README.md +++ b/README.md @@ -31,7 +31,7 @@ connection = sge.simpleetl( private_key = 'some private key' ) -files = connection.fetch('/remote/dir/path') # Create local copies of child files at specified remote path +connection.fetch('/remote/dir/path') # Create local copies of child files at specified remote path transform_file('ExampleFile.txt') # Transform local file From fe410438405d2414de4ff92b12d1e5ea6696b4c7 Mon Sep 17 00:00:00 2001 From: Christopher Glenn Date: Mon, 18 Apr 2022 13:01:54 -0400 Subject: [PATCH 6/8] Update docstrings --- simple_graph_etl/simpleetl.py | 1 - 1 file changed, 1 deletion(-) diff --git a/simple_graph_etl/simpleetl.py b/simple_graph_etl/simpleetl.py index b733d1d..c6026ff 100644 --- a/simple_graph_etl/simpleetl.py +++ b/simple_graph_etl/simpleetl.py @@ -83,7 +83,6 @@ def fetch(self, source_path): Parameters: source_path (string): Path to parent directory containing target files Returns: - file_list (any[]): List of file objects retrieved from source_path """ file_list = [] From 64632a5bb60b45277eb80b4b4f9cf3825c20583e Mon Sep 17 00:00:00 2001 From: Christopher Glenn Date: Mon, 18 Apr 2022 18:57:17 -0400 Subject: [PATCH 7/8] Add filenames function, create local path param --- README.md | 14 +++- setup.py | 2 +- simple_graph_etl/documentlibrary.py | 1 - simple_graph_etl/simpleetl.py | 109 +++++++++++++++------------- 4 files changed, 70 insertions(+), 56 deletions(-) diff --git a/README.md b/README.md index 0a3ded8..83e05f6 100644 --- a/README.md +++ b/README.md @@ -5,7 +5,7 @@ Minimal wrapper lib for Python ETLs using Microsoft's Graph API Designed with intent for use in Ohio University Python scripts interacting with the Graph API ## Example -Example document library structure: +### Example document library structure: ``` remote └──dir @@ -13,11 +13,11 @@ remote └──ExampleFile.txt ``` -Example ETL: +### Example ETL: ```Python import simple_graph_etl as sge 
-documentLibrary = sge.documentlibrary( +documentLibrary = sge.DocumentLibrary( client_id = 'some client ID', site_id = 'some site ID', res_id = 'some res ID', @@ -25,7 +25,7 @@ documentLibrary = sge.documentlibrary( scope = 'some scope' ) -connection = sge.simpleetl( +connection = sge.SimpleETL( library = documentLibrary, thumbprint = 'some thumbprint', private_key = 'some private key' @@ -44,5 +44,11 @@ connection.upload('/remote/dir/path', 'ExampleFile.txt') # Upload local copy of ## TODO Add tests + Peer review + +Create detailed usage spec docs + Publish + +Move to enterprise space? diff --git a/setup.py b/setup.py index be19648..9402dfe 100644 --- a/setup.py +++ b/setup.py @@ -3,7 +3,7 @@ setup( name='simple_graph_etl', packages=find_packages(include=['simple_graph_etl']), - version='1.0.0', + version='1.1.0', description="Minimal wrapper lib for Python ETLs using Microsoft's Graph API", author='glennpai / chglenn20@gmail.com', license='MIT', diff --git a/simple_graph_etl/documentlibrary.py b/simple_graph_etl/documentlibrary.py index 3da9b33..c8b8c4e 100644 --- a/simple_graph_etl/documentlibrary.py +++ b/simple_graph_etl/documentlibrary.py @@ -36,7 +36,6 @@ def get_base_url(self): Returns base URL used in most ETL functions via the Graph API Parameters: - Returns: URL string constructed from site and res IDs """ diff --git a/simple_graph_etl/simpleetl.py b/simple_graph_etl/simpleetl.py index c6026ff..144dff4 100644 --- a/simple_graph_etl/simpleetl.py +++ b/simple_graph_etl/simpleetl.py @@ -1,7 +1,8 @@ """ Module to simplify basic Python ETL interactions with a SharePoint document library """ -from os import path +import os +import re import msal import requests @@ -37,7 +38,7 @@ def __get_item_id(file_items, target_name): file_items (any[]): List of file objects to check target_name (string): Name to search for in list of file objects Returns: - item_id (string): URL string constructed from site and res IDs + item_id (string): Item ID property value """ 
item_id = '' @@ -54,7 +55,6 @@ def __acquire_token(self): calls to the Graph API Parameters: - Returns: result['access_token'] (string): String value of auth token """ @@ -63,127 +63,136 @@ def __acquire_token(self): authority=self.library.authority, client_credential={'thumbprint': self.__thumbprint, 'private_key': self.__private_key}, ) - result = None result = app.acquire_token_silent([self.library.scope], account=None) if not result: result = app.acquire_token_for_client(scopes=[self.library.scope]) - if 'access_token' in result: return result['access_token'] raise Exception(result.get('error')) - def fetch(self, source_path): + def filenames(self, remote_path): """ - Gets a list of files that are children to the source_path directory + Gets a list of file names that are children to the remote_path directory + Useful for checking existence of a remote file Parameters: - source_path (string): Path to parent directory containing target files + remote_path (string): Path to parent directory containing target files Returns: + filenames (string[]): List of file names in the remote_path directory """ - file_list = [] - - file_list_resp = requests.get(f'{self.library.base_url}/root:/{source_path}:/children', + filenames = [] + file_list_resp = requests.get(f'{self.library.base_url}/root:/{remote_path}:/children', headers={'Authorization': 'Bearer ' + self.__token}) if file_list_resp.status_code == 200: objs = file_list_resp.json()['value'] + for obj in objs: + if obj['file']: + filenames.append(obj['name']) + else: + raise Exception('Bad response from the remote host.' + + f'{file_list_resp.raise_for_status()}') + + return filenames + + def fetch(self, remote_path, local_path='.'): + """ + Creates a local copy of files contained in the document library at the remote_path + + Parameters: + remote_path (string): Path to parent directory containing target files + local_path (string): Path to local directory where files will be written - Default '.' 
+ Returns: + """ + file_list_resp = requests.get(f'{self.library.base_url}/root:/{remote_path}:/children', + headers={'Authorization': 'Bearer ' + self.__token}) + if file_list_resp.status_code == 200: + objs = file_list_resp.json()['value'] for obj in objs: - if '@microsoft.graph.downloadUrl' not in obj: + if not obj['file']: continue - file_data = requests.get(obj['@microsoft.graph.downloadUrl']) - if file_data.status_code == 200: try: - file = open(obj['name'], 'wb') - file.write(file_data.content) - file_list.append(obj['name']) + clean_path = re.sub(r'^(\\|\/)+|(\\|\/)+$', '', local_path) + if not os.path.exists(clean_path): + os.makedirs(clean_path) + with open(os.path.join(clean_path, obj['name']), 'wb') as file: + file.write(file_data.content) except Exception as err: raise f'Failed to write file data. {err}' else: raise Exception(f'Bad response fetching file "{obj["name"]}".' + f'{file_data.raise_for_status()}') - else: raise Exception('Bad response from the remote host.' + f'{file_list_resp.raise_for_status()}') - return file_list - - def delete(self, dest_path, file_name): + def delete(self, remote_path, file_name): """ Deletes a remote file from a SharePoint document library based on file path and name Parameters: - dest_path (string): Remote path of parent directory of file to delete + remote_path (string): Remote path of parent directory of file to delete file_name (string): Name of remote file to delete Returns: """ - list_url = f'{self.library.base_url}/root:/{dest_path}:/children' - + list_url = f'{self.library.base_url}/root:/{remote_path}:/children' file_list_response = requests.get(list_url, headers={'Authorization': 'Bearer ' + self.__token}) if file_list_response.status_code == 200: item_id = self.__get_item_id(file_list_response.json()['value'], file_name) - if item_id != '': delete_url = f'{self.library.base_url}/items/' - delete_response = requests.delete(delete_url + item_id, headers={'Authorization': 'Bearer ' + self.__token}) - if 
delete_response.status_code != 204: raise Exception(f'Failed to delete {file_name}. \ {delete_response.raise_for_status()}') - else: raise Exception(f'Failed to fetch item info for {file_name}') - else: - raise Exception(f'Failed to fetch file list from {dest_path}. \ + raise Exception(f'Failed to fetch file list from {remote_path}. \ {file_list_response.raise_for_status()}') - def upload(self, dest_path, file_name): + def upload(self, file_name, remote_path, local_path='.'): """ - Uploads a local file to a SharePoint document library + Uploads a local file to a SharePoint document library at a specified remote_path Parameters: - dest_path (string): Remote path of parent directory of file to upload - file_name (string): Name of local file to upload + file_name (string): Local file name and format + remote_path (string): Remote path of parent directory of file to upload + local_path (string): Local path to file - Default '.' Returns: """ upload_session = requests.post(f'{self.library.base_url}/root:/ \ - {dest_path}/{file_name}:/createUploadSession', + {remote_path}/{file_name}:/createUploadSession', headers={'Authorization': 'Bearer ' + self.__token}) if upload_session.status_code == 200: upload_url = upload_session.json()['uploadUrl'] - try: - file = open(f'{file_name}', 'rb') - file_size = path.getsize(file_name) - - # Content length and content range are required headers. - # File data (bytes) is sent in body. - upload_response = requests.put(upload_url, - headers={'Content-Length': f'{file_size}', - 'Content-Range': f'bytes 0-{file_size - 1}/{file_size}'}, - data=file) - - if upload_response.status_code != 201: - raise Exception(upload_response.raise_for_status()) - + full_local = os.path.join(local_path, file_name) + with open(full_local, 'rb') as file: + file_size = os.path.getsize(full_local) + # Content length and content range are required headers. + # File data (bytes) is sent in body. 
+ upload_response = requests.put(upload_url, + headers={'Content-Length': f'{file_size}', + 'Content-Range': f'bytes 0-{file_size - 1}/{file_size}'}, + data=file) + if upload_response.status_code != 201: + raise Exception(upload_response.raise_for_status()) except Exception as err: raise f'Failed to upload file to upload URL. {err}' - else: raise Exception(f'Error retrieving upload URL. {upload_session.raise_for_status()}') From cdaf4d43d76241484b315c503bbf14c51aea2c45 Mon Sep 17 00:00:00 2001 From: Christopher Glenn <60518533+glennpai@users.noreply.github.com> Date: Mon, 25 Apr 2022 19:53:20 -0400 Subject: [PATCH 8/8] Update README.md --- README.md | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/README.md b/README.md index 83e05f6..d4c3e0a 100644 --- a/README.md +++ b/README.md @@ -1,5 +1,10 @@ # simple-graph-etl +## NOTE +This project has been moved into Ohio University's GitHub Enterprise project space. As such, this repo will not be updated as frequently until the full release of the package. It will then be cloned back to this repo for historical / showcase purposes. + +--- + Minimal wrapper lib for Python ETLs using Microsoft's Graph API Designed with intent for use in Ohio University Python scripts interacting with the Graph API