Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
.env
venv
Copy link
Copy Markdown
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

missing a new line at the end of the file.

60 changes: 59 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
@@ -1 +1,59 @@
# simple-graph
# simple-graph-etl

## NOTE
This project has been moved into Ohio University's GitHub Enterprise project space. As such, this repo will not be updated as frequently until the full release of the package. It will then be cloned back to this repo for historical / showcase purposes.

---

Minimal wrapper lib for Python ETLs using Microsoft's Graph API

Designed with intent for use in Ohio University Python scripts interacting with the Graph API

## Example
### Example document library structure:
```
remote
└──dir
└──path
└──ExampleFile.txt
```

### Example ETL:
```Python
import simple_graph_etl as sge

documentLibrary = sge.DocumentLibrary(
client_id = 'some client ID',
site_id = 'some site ID',
res_id = 'some res ID',
authority = 'some authority',
scope = 'some scope'
)

connection = sge.SimpleETL(
library = documentLibrary,
thumbprint = 'some thumbprint',
private_key = 'some private key'
)

connection.fetch('/remote/dir/path') # Create local copies of child files at specified remote path

transform_file('ExampleFile.txt') # Transform local file

connection.delete('/remote/dir/path', 'ExampleFile.txt') # Delete remote copy of file as it will be replaced

connection.upload('/remote/dir/path', 'ExampleFile.txt') # Upload local copy of file to same location as original

```

## TODO

Add tests

Peer review

Create detailed usage spec docs

Publish

Move to enterprise space?
14 changes: 14 additions & 0 deletions setup.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
from setuptools import find_packages, setup

setup(
Copy link
Copy Markdown
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I assume you eventually want to render the README file to your package's profile page on the PyPI. PyPI won't automatically pick up the README file. You have to read the file and assign it to long_description. see more details at https://packaging.python.org/en/latest/guides/making-a-pypi-friendly-readme/

name='simple_graph_etl',
packages=find_packages(include=['simple_graph_etl']),
version='1.1.0',
description="Minimal wrapper lib for Python ETLs using Microsoft's Graph API",
author='glennpai / chglenn20@gmail.com',
license='MIT',
install_requires=['msal', 'requests'],
setup_requires=['pytest-runner'],
tests_require=['pytest==4.4.1'],
test_suite='tests',
)
Empty file added simple_graph_etl/__init__.py
Empty file.
43 changes: 43 additions & 0 deletions simple_graph_etl/documentlibrary.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
"""
Module to unify and simplify configuration of a SharePoint document library for use in
a Python ETL
"""
class DocumentLibrary:
Copy link
Copy Markdown
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Based on the content of this class and how it is used in the SimpleETL class. I feel like calling it DocumentLibraryConfig, DocumentLibraryOptions, or something similar might be more approperiate.

"""
A class containing configuration for accessing a SharePoint document library
via the Graph API

Attributes:
client_id (string): Client ID for Azure Active Directory subscription
site_id (string): Site ID for a library's parent SharePoint site
res_id (string): Resource ID of a SharePoint document library
authority (string): Authority string for an Azure app registration
scope (string): Permission scopes of the user authenticating to the Azure app registration
base_url (string): Base URL of the document library formed from the Site and Res IDs
"""
# Pylint(R0913:too-many-arguments)
# Ignoring in interest of keeping config flat and consistent
def __init__(self, client_id, site_id, res_id, authority, scope):
self.client_id = client_id
self.site_id = site_id
self.res_id = res_id
self.authority = authority
self.scope = scope
self.base_url = self.get_base_url()


def __repr__(self):
return f'DocumentLibrary({self.client_id},{self.site_id},{self.res_id}, \
{self.authority},{self.scope},{self.base_url}'


def get_base_url(self):
"""
Returns base URL used in most ETL functions via the Graph API

Parameters:
Returns:
URL string constructed from site and res IDs
"""
return f'https://graph.microsoft.com/v1.0/sites/ \
{self.site_id}/drives/{self.res_id}'
198 changes: 198 additions & 0 deletions simple_graph_etl/simpleetl.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,198 @@
"""
Module to simplify basic Python ETL interactions with a SharePoint document library
"""
import os
import re
import msal
import requests

class SimpleETL:
"""
A class to simplify ETL functions perfomed on Azure app registrations and SharePoint
document libraries via the Graph API

Class constructor accepts a DocumentLibrary instance and the required authentication
configuration

Attributes:
library (DocumentLibrary): SharePoint document library configuration
__thumbprint [private] (string): Hash of signed certificate used when authenticating to the
Azure app registration
__private_key [private] (string): Private key used to authenticate to the Azure app
registration
__token [private] (string): Authentication token acquired from Azure app registration
"""
def __init__(self, document_library, thumbprint, private_key):
self.library = document_library
self.__thumbprint = thumbprint
self.__private_key = private_key
self.__token = self.__acquire_token()
Copy link
Copy Markdown
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It's better to move the token exchange process out of the constructor and probably run it right before the first time an action (read/write/etc) requiring the authentication is executed, giving the following reasons:

  1. It doesn't make sense to spend time before it is actually needed. If someone set up the package but ends up not using it, then the resource is wasted, although developers shouldn't do that.
  2. It is harder to test. General speaking, in OOP, (with a few exceptions), the only thing a constructor should do is dependency injection, meaning assigns dependencies from the caller to its instance members. We usually don't have to test constructors, but doing business logic in it will then requires you to test the constructor itself.



@staticmethod
def __get_item_id(file_items, target_name):
Copy link
Copy Markdown
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It doesn't have to stay inside the class as a static method. It seems like a utility function and can put it into a dedicate utility py file, or still stays in the same file with the class but not inside the class.

"""
Gets item ID value from a file object if its name matches the target name

Parameters:
file_items (any[]): List of file objects to check
target_name (string): Name to search for in list of file objects
Returns:
item_id (string): Item ID property value
"""
item_id = ''

for item in file_items:
if item['name'] == target_name:
item_id = item['id']

return item_id
Comment on lines +43 to +49
Copy link
Copy Markdown
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

  1. It makes more sense to return None rather than an empty string when the item is not found.
  2. be careful, Python throws a KeyError if access a non-existing key from a dict. This means that you may get trapped if there is no name in item, especially the file_items comes from an outside resource, you have no controls over what you may receive. I suggest to either use item.get('name') (returns None when not found) or check 'name' in item (returns a boolean) before using item['name'].
  3. the function can be simplified to
    for item in file_itmes:
        if item.get('name') == target_name:
            return item.get('id')
    return None



def __acquire_token(self):
Copy link
Copy Markdown
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Does the token have an expiration? If so, how do you make sure the token is still valid when the consumer needs it? I usually don't ask this question if it is in a single app. But in terms of a library (that will be used by many projects with different requirements), it's better to make it robust. As not all apps/integrations complete quickly that won't get impacted by the token expiration.

"""
Authenticates against Azure app registration to get an auth token used for
calls to the Graph API

Parameters:
Returns:
result['access_token'] (string): String value of auth token
"""
app = msal.ConfidentialClientApplication(
self.library.client_id,
authority=self.library.authority,
client_credential={'thumbprint': self.__thumbprint, 'private_key': self.__private_key},
)
result = None
result = app.acquire_token_silent([self.library.scope], account=None)
Comment on lines +66 to +67
Copy link
Copy Markdown
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Does app.acquire_token_silent return None when no token cache is available? If so, then result = None is not necessary.


if not result:
result = app.acquire_token_for_client(scopes=[self.library.scope])
if 'access_token' in result:
return result['access_token']

raise Exception(result.get('error'))


def filenames(self, remote_path):
Copy link
Copy Markdown
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

My personal preference, listFiles is more self-explainable.

"""
Gets a list of file names that are children to the remote_path directory
Useful for checking existence of a remote file

Parameters:
remote_path (string): Path to parent directory containing target files
Returns:
filenames (string[]): List of file names in the remote_path directory
"""
filenames = []
file_list_resp = requests.get(f'{self.library.base_url}/root:/{remote_path}:/children',
Copy link
Copy Markdown
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

You may be able to use filter and select query parameter to:

  1. return files only https://docs.microsoft.com/en-us/graph/query-parameters#filter-parameter
  2. return only the attributes you need https://docs.microsoft.com/en-us/graph/query-parameters#select-parameter

This comment also applies to other places that make api calls.

headers={'Authorization': 'Bearer ' + self.__token})

if file_list_resp.status_code == 200:
objs = file_list_resp.json()['value']
for obj in objs:
if obj['file']:
filenames.append(obj['name'])
Comment on lines +92 to +95
Copy link
Copy Markdown
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think it can be simplified to:

files = filter(lambda obj: obj['file'], objs)
filenames = [ file['name'] for file in files ]

else:
raise Exception('Bad response from the remote host.' +
f'{file_list_resp.raise_for_status()}')

return filenames


def fetch(self, remote_path, local_path='.'):
"""
Creates a local copy of files contained in the document library at the remote_path

Parameters:
remote_path (string): Path to parent directory containing target files
local_path (string): Path to local directory where files will be written - Default '.'
Returns:
"""
file_list_resp = requests.get(f'{self.library.base_url}/root:/{remote_path}:/children',
headers={'Authorization': 'Bearer ' + self.__token})
if file_list_resp.status_code == 200:
objs = file_list_resp.json()['value']
for obj in objs:
Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'd like to point these lines out in particular. There are a few ways to determine if an object is a file or a folder. You can check that it has the file property, and then use the download URL since you know it will exist. The second way is to just check that it has a download URL.
I think the way it is done here is fine since we don't care about folders, and only downloadable content in the drive. Just something worth pointing out since we can also use the properties to check for images and/or videos as well. This could be useful elsewhere.

Copy link
Copy Markdown
Owner

@glennpai glennpai Apr 18, 2022

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Included change in pull -> #3
The existence of obj['file'] is almost the same as what we're doing above but this 'file' property will exist only on filetype objects. I don't know if there are any non-filetype objects that would have a downloadURL (folders do not) but this change should prevent us from grabbing those if there are.

if not obj['file']:
continue
file_data = requests.get(obj['@microsoft.graph.downloadUrl'])
if file_data.status_code == 200:
try:
clean_path = re.sub(r'^(\\|\/)+|(\\|\/)+$', '', local_path)
Copy link
Copy Markdown
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

seems like you can use os.path.normpath(...) here https://docs.python.org/3/library/os.path.html#os.path.normpath

if not os.path.exists(clean_path):
os.makedirs(clean_path)
with open(os.path.join(clean_path, obj['name']), 'wb') as file:
file.write(file_data.content)
except Exception as err:
raise f'Failed to write file data. {err}'
else:
raise Exception(f'Bad response fetching file "{obj["name"]}".' +
f'{file_data.raise_for_status()}')
else:
raise Exception('Bad response from the remote host.' +
f'{file_list_resp.raise_for_status()}')


def delete(self, remote_path, file_name):
"""
Deletes a remote file from a SharePoint document library based on file path
and name

Parameters:
remote_path (string): Remote path of parent directory of file to delete
file_name (string): Name of remote file to delete
Returns:
"""
list_url = f'{self.library.base_url}/root:/{remote_path}:/children'
file_list_response = requests.get(list_url,
headers={'Authorization': 'Bearer ' + self.__token})
Comment on lines +147 to +149
Copy link
Copy Markdown
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

hmm, I saw similar code in the filenames method. To improve reusability, maybe you can create another method to handle the list files/dirs work and let filenames and delete methods use it rather than doing their own file listing.


if file_list_response.status_code == 200:
item_id = self.__get_item_id(file_list_response.json()['value'], file_name)
if item_id != '':
delete_url = f'{self.library.base_url}/items/'
delete_response = requests.delete(delete_url + item_id,
headers={'Authorization': 'Bearer ' + self.__token})
if delete_response.status_code != 204:
raise Exception(f'Failed to delete {file_name}. \
{delete_response.raise_for_status()}')
Copy link
Copy Markdown
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

raise_for_status will actually raise an exception. It makes your raise Exception(...) pointless. There are several cases like that in this file.

else:
raise Exception(f'Failed to fetch item info for {file_name}')
else:
raise Exception(f'Failed to fetch file list from {remote_path}. \
{file_list_response.raise_for_status()}')


def upload(self, file_name, remote_path, local_path='.'):
"""
Uploads a local file to a SharePoint document library at a specified remote_path

Parameters:
local_file (string): Local file name and format
remote_path (string): Remote path of parent directory of file to upload
local_path (string): Local path to file - Default '.'
Returns:
"""
upload_session = requests.post(f'{self.library.base_url}/root:/ \
{remote_path}/{file_name}:/createUploadSession',
headers={'Authorization': 'Bearer ' + self.__token})

if upload_session.status_code == 200:
upload_url = upload_session.json()['uploadUrl']
try:
full_local = os.path.join(local_path, file_name)
with open(full_local, 'rb') as file:
file_size = os.path.getsize(full_local)
# Content length and content range are required headers.
# File data (bytes) is sent in body.
upload_response = requests.put(upload_url,
headers={'Content-Length': f'{file_size}',
'Content-Range': f'bytes 0-{file_size - 1}/{file_size}'},
data=file)
if upload_response.status_code != 201:
raise Exception(upload_response.raise_for_status())
except Exception as err:
raise f'Failed to upload file to upload URL. {err}'
else:
raise Exception(f'Error retrieving upload URL. {upload_session.raise_for_status()}')
Empty file added tests/__init__.py
Empty file.
Empty file added tests/tests.py
Empty file.