diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..50a19c6 --- /dev/null +++ b/.gitignore @@ -0,0 +1,2 @@ +.env +venv \ No newline at end of file diff --git a/README.md b/README.md index ab00ea4..d4c3e0a 100644 --- a/README.md +++ b/README.md @@ -1 +1,59 @@ -# simple-graph +# simple-graph-etl + +## NOTE +This project has been moved into Ohio University's GitHub Enterprise project space. As such, this repo will not be updated as frequently until the full release of the package. It will then be cloned back to this repo for historical / showcase purposes. + +--- + +Minimal wrapper lib for Python ETLs using Microsoft's Graph API + +Designed with intent for use in Ohio University Python scripts interacting with the Graph API + +## Example +### Example document library structure: +``` +remote +└──dir + └──path + └──ExampleFile.txt +``` + +### Example ETL: +```Python +import simple_graph_etl as sge + +documentLibrary = sge.DocumentLibrary( + client_id = 'some client ID', + site_id = 'some site ID', + res_id = 'some res ID', + authority = 'some authority', + scope = 'some scope' + ) + +connection = sge.SimpleETL( + library = documentLibrary, + thumbprint = 'some thumbprint', + private_key = 'some private key' +) + +connection.fetch('/remote/dir/path') # Create local copies of child files at specified remote path + +transform_file('ExampleFile.txt') # Transform local file + +connection.delete('/remote/dir/path', 'ExampleFile.txt') # Delete remote copy of file as it will be replaced + +connection.upload('/remote/dir/path', 'ExampleFile.txt') # Upload local copy of file to same location as original + +``` + +## TODO + +Add tests + +Peer review + +Create detailed usage spec docs + +Publish + +Move to enterprise space? diff --git a/setup.py b/setup.py new file mode 100644 index 0000000..9402dfe --- /dev/null +++ b/setup.py @@ -0,0 +1,14 @@ +from setuptools import find_packages, setup + +setup( + name='simple_graph_etl', + packages=find_packages(include=['simple_graph_etl']), + version='1.1.0', + description="Minimal wrapper lib for Python ETLs using Microsoft's Graph API", + author='glennpai / chglenn20@gmail.com', + license='MIT', + install_requires=['msal', 'requests'], + setup_requires=['pytest-runner'], + tests_require=['pytest==4.4.1'], + test_suite='tests', +) diff --git a/simple_graph_etl/__init__.py b/simple_graph_etl/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/simple_graph_etl/documentlibrary.py b/simple_graph_etl/documentlibrary.py new file mode 100644 index 0000000..c8b8c4e --- /dev/null +++ b/simple_graph_etl/documentlibrary.py @@ -0,0 +1,43 @@ +""" +Module to unify and simplify configuration of a SharePoint document library for use in +a Python ETL +""" +class DocumentLibrary: + """ + A class containing configuration for accessing a SharePoint document library + via the Graph API + + Attributes: + client_id (string): Client ID for Azure Active Directory subscription + site_id (string): Site ID for a library's parent SharePoint site + res_id (string): Resource ID of a SharePoint document library + authority (string): Authority string for an Azure app registration + scope (string): Permission scopes of the user authenticating to the Azure app registration + base_url (string): Base URL of the document library formed from the Site and Res IDs + """ + # Pylint(R0913:too-many-arguments) + # Ignoring in interest of keeping config flat and consistent + def __init__(self, client_id, site_id, res_id, authority, scope): + self.client_id = client_id + self.site_id = site_id + self.res_id = res_id + self.authority = authority + self.scope = scope + self.base_url = self.get_base_url() + + + def __repr__(self): + return f'DocumentLibrary({self.client_id},{self.site_id},{self.res_id}, \ + {self.authority},{self.scope},{self.base_url}' + + + def get_base_url(self): + """ + Returns base URL used in most ETL functions via the Graph API + + Parameters: + Returns: + URL string constructed from site and res IDs + """ + return f'https://graph.microsoft.com/v1.0/sites/ \ + {self.site_id}/drives/{self.res_id}' diff --git a/simple_graph_etl/simpleetl.py b/simple_graph_etl/simpleetl.py new file mode 100644 index 0000000..144dff4 --- /dev/null +++ b/simple_graph_etl/simpleetl.py @@ -0,0 +1,198 @@ +""" +Module to simplify basic Python ETL interactions with a SharePoint document library +""" +import os +import re +import msal +import requests + +class SimpleETL: + """ + A class to simplify ETL functions perfomed on Azure app registrations and SharePoint + document libraries via the Graph API + + Class constructor accepts a DocumentLibrary instance and the required authentication + configuration + + Attributes: + library (DocumentLibrary): SharePoint document library configuration + __thumbprint [private] (string): Hash of signed certificate used when authenticating to the + Azure app registration + __private_key [private] (string): Private key used to authenticate to the Azure app + registration + __token [private] (string): Authentication token acquired from Azure app registration + """ + def __init__(self, document_library, thumbprint, private_key): + self.library = document_library + self.__thumbprint = thumbprint + self.__private_key = private_key + self.__token = self.__acquire_token() + + + @staticmethod + def __get_item_id(file_items, target_name): + """ + Gets item ID value from a file object if its name matches the target name + + Parameters: + file_items (any[]): List of file objects to check + target_name (string): Name to search for in list of file objects + Returns: + item_id (string): Item ID property value + """ + item_id = '' + + for item in file_items: + if item['name'] == target_name: + item_id = item['id'] + + return item_id + + + def __acquire_token(self): + """ + Authenticates against Azure app registration to get an auth token used for + calls to the Graph API + + Parameters: + Returns: + result['access_token'] (string): String value of auth token + """ + app = msal.ConfidentialClientApplication( + self.library.client_id, + authority=self.library.authority, + client_credential={'thumbprint': self.__thumbprint, 'private_key': self.__private_key}, + ) + result = None + result = app.acquire_token_silent([self.library.scope], account=None) + + if not result: + result = app.acquire_token_for_client(scopes=[self.library.scope]) + if 'access_token' in result: + return result['access_token'] + + raise Exception(result.get('error')) + + + def filenames(self, remote_path): + """ + Gets a list of file names that are children to the remote_path directory + Useful for checking existence of a remote file + + Parameters: + remote_path (string): Path to parent directory containing target files + Returns: + filenames (string[]): List of file names in the remote_path directory + """ + filenames = [] + file_list_resp = requests.get(f'{self.library.base_url}/root:/{remote_path}:/children', + headers={'Authorization': 'Bearer ' + self.__token}) + + if file_list_resp.status_code == 200: + objs = file_list_resp.json()['value'] + for obj in objs: + if obj['file']: + filenames.append(obj['name']) + else: + raise Exception('Bad response from the remote host.' + + f'{file_list_resp.raise_for_status()}') + + return filenames + + + def fetch(self, remote_path, local_path='.'): + """ + Creates a local copy of files contained in the document library at the remote_path + + Parameters: + remote_path (string): Path to parent directory containing target files + local_path (string): Path to local directory where files will be written - Default '.' + Returns: + """ + file_list_resp = requests.get(f'{self.library.base_url}/root:/{remote_path}:/children', + headers={'Authorization': 'Bearer ' + self.__token}) + if file_list_resp.status_code == 200: + objs = file_list_resp.json()['value'] + for obj in objs: + if not obj['file']: + continue + file_data = requests.get(obj['@microsoft.graph.downloadUrl']) + if file_data.status_code == 200: + try: + clean_path = re.sub(r'^(\\|\/)+|(\\|\/)+$', '', local_path) + if not os.path.exists(clean_path): + os.makedirs(clean_path) + with open(os.path.join(clean_path, obj['name']), 'wb') as file: + file.write(file_data.content) + except Exception as err: + raise f'Failed to write file data. {err}' + else: + raise Exception(f'Bad response fetching file "{obj["name"]}".' + + f'{file_data.raise_for_status()}') + else: + raise Exception('Bad response from the remote host.' + + f'{file_list_resp.raise_for_status()}') + + + def delete(self, remote_path, file_name): + """ + Deletes a remote file from a SharePoint document library based on file path + and name + + Parameters: + remote_path (string): Remote path of parent directory of file to delete + file_name (string): Name of remote file to delete + Returns: + """ + list_url = f'{self.library.base_url}/root:/{remote_path}:/children' + file_list_response = requests.get(list_url, + headers={'Authorization': 'Bearer ' + self.__token}) + + if file_list_response.status_code == 200: + item_id = self.__get_item_id(file_list_response.json()['value'], file_name) + if item_id != '': + delete_url = f'{self.library.base_url}/items/' + delete_response = requests.delete(delete_url + item_id, + headers={'Authorization': 'Bearer ' + self.__token}) + if delete_response.status_code != 204: + raise Exception(f'Failed to delete {file_name}. \ + {delete_response.raise_for_status()}') + else: + raise Exception(f'Failed to fetch item info for {file_name}') + else: + raise Exception(f'Failed to fetch file list from {remote_path}. \ + {file_list_response.raise_for_status()}') + + + def upload(self, file_name, remote_path, local_path='.'): + """ + Uploads a local file to a SharePoint document library at a specified remote_path + + Parameters: + local_file (string): Local file name and format + remote_path (string): Remote path of parent directory of file to upload + local_path (string): Local path to file - Default '.' + Returns: + """ + upload_session = requests.post(f'{self.library.base_url}/root:/ \ + {remote_path}/{file_name}:/createUploadSession', + headers={'Authorization': 'Bearer ' + self.__token}) + + if upload_session.status_code == 200: + upload_url = upload_session.json()['uploadUrl'] + try: + full_local = os.path.join(local_path, file_name) + with open(full_local, 'rb') as file: + file_size = os.path.getsize(full_local) + # Content length and content range are required headers. + # File data (bytes) is sent in body. + upload_response = requests.put(upload_url, + headers={'Content-Length': f'{file_size}', + 'Content-Range': f'bytes 0-{file_size - 1}/{file_size}'}, + data=file) + if upload_response.status_code != 201: + raise Exception(upload_response.raise_for_status()) + except Exception as err: + raise f'Failed to upload file to upload URL. {err}' + else: + raise Exception(f'Error retrieving upload URL. {upload_session.raise_for_status()}') diff --git a/tests/__init__.py b/tests/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/tests.py b/tests/tests.py new file mode 100644 index 0000000..e69de29