-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathfunctions.py
More file actions
125 lines (99 loc) · 3.64 KB
/
functions.py
File metadata and controls
125 lines (99 loc) · 3.64 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
import hashlib
import random
import time
from typing import List
import pandas as pd
from pandas import DataFrame
class MediaSchemaWithNames:
    """Field declarations for a media record, including a ``url`` field.

    The class only declares annotated attributes; no values are assigned
    and no constructor is defined here.
    """

    # Textual fields of the record.
    title: str
    release_date: str  # kept as a string; the expected format is not enforced here
    # Keywords are declared as a list of strings.
    keywords: List[str]
    # Metadata is declared as a single string, not a parsed structure.
    raw_metadata: str
    # Location of the media item.
    url: str
def media_service_search_person(
    media_service: str, first_name: str, last_name: str, start: int, max_results: int
) -> DataFrame:
    """
    Return mock search results for a person as a DataFrame.

    No media service is contacted: rows are generated locally and the
    ``media_service`` argument is accepted but not used.

    Parameters
    ----------
    media_service : str
        Name of the media service (unused by this mock).
    first_name : str
        First name of the person; copied into ``person_first_name``.
    last_name : str
        Last name of the person; copied into ``person_last_name``.
    start : int
        Index of the first mock result to generate.
    max_results : int
        Number of mock results to generate. Zero or a negative value
        yields an empty DataFrame that still has every column.

    Returns
    -------
    DataFrame
        One row per index in ``range(start, start + max_results)`` with the
        columns ``title``, ``release_date``, ``keywords``, ``raw_metadata``,
        ``url``, ``person_first_name`` and ``person_last_name``.
    """
    # Declared up front so that an empty result keeps the full schema; without
    # it an empty row list produces a column-less frame and the dtype
    # adjustments below fail with KeyError.
    columns = [
        "title",
        "release_date",
        "keywords",
        "raw_metadata",
        "url",
        "person_first_name",
        "person_last_name",
    ]

    # Short identifier derived from the name, so the same person always gets
    # the same mock URLs.
    person_id = hashlib.sha256(f"{first_name}{last_name}".encode()).hexdigest()[:10]

    # Mock data generation
    mock_data = [
        {
            "title": f"Mock Title {i}",
            "release_date": "2023-01-01",
            "keywords": ["mock", "data"],
            "raw_metadata": "{'mock': 'data'}",
            "url": f"https://www.youtube.com/watch?v={person_id}_{i}",
            "person_first_name": first_name,
            "person_last_name": last_name,
        }
        for i in range(start, start + max_results)
    ]

    # Create DataFrame
    df = pd.DataFrame(mock_data, columns=columns)

    # Adjust data types (the source comment says this mirrors the function
    # being mocked, which is not visible in this file).
    df["keywords"] = df["keywords"].astype(pd.StringDtype())
    df["raw_metadata"] = df["raw_metadata"].astype(str)
    return df
def is_speaker_in_video(
    person_name: str,
    description: str,
    model: str = "gpt-4",
    *,
    simulated_latency: float = 2.0,
) -> float:
    """
    Determine if a given person is mentioned as a speaker in a video description.

    This implementation is a mock: it does not inspect ``person_name``,
    ``description`` or ``model``. It waits ``simulated_latency`` seconds and
    then returns a fixed probability of 0.5.

    Parameters
    ----------
    person_name : str
        Name of the person to check if they are mentioned as a speaker.
    description : str
        Description of the video.
    model : str, optional
        Name of the model to use for determination, by default "gpt-4".
    simulated_latency : float, optional
        Seconds to sleep before returning, by default 2.0. Pass 0 to skip
        the delay (for example in tests).

    Returns
    -------
    float
        A probability value between 0.0 and 1.0 indicating the likelihood that
        the person is mentioned as a speaker in the video description.
        The mock always returns 0.5.

    Raises
    ------
    ValueError
        If the returned probability value is not between 0.0 and 1.0.

    Example
    -------
    The values below illustrate the intended contract of a real model; the
    mock returns 0.5 for every input, so the examples are not executed.

    >>> is_speaker_in_video("John Doe", "This video features talks by John Doe and Jane Smith.")  # doctest: +SKIP
    1.0
    >>> is_speaker_in_video("John Doe", "In this documentary, we discuss the theories of John Doe.")  # doctest: +SKIP
    0.0
    """
    # Stand-in for the time a model call would take.
    if simulated_latency > 0:
        time.sleep(simulated_latency)

    probability = 0.5

    # Guard clause kept so a future non-constant implementation is validated.
    if not 0.0 <= probability <= 1.0:
        raise ValueError(f"probability returned is {probability} but should be between 0.0 and 1.0")
    return probability
def get_diarized_transcript(
    media_service: str, url: str, *, simulated_latency: float = 10.0
) -> pd.DataFrame:
    """
    Return a mock diarized transcript for a media URL.

    No media is fetched or transcribed, and ``media_service`` is accepted
    but not used. The rows are pseudo-random and derived only from ``url``,
    so the same URL always yields the same transcript.

    Parameters
    ----------
    media_service : str
        Name of the media service (unused by this mock).
    url : str
        URL of the media item; its SHA-256 hash seeds the generator.
    simulated_latency : float, optional
        Seconds to sleep before returning, by default 10.0. Pass 0 to skip
        the delay (for example in tests).

    Returns
    -------
    pd.DataFrame
        Five rows with the columns ``speaker``, ``start``, ``end`` and
        ``text``. ``start`` and ``end`` are floats in the range 0 to 10 with
        ``start <= end`` on every row.
    """
    num_rows = 5
    sentences = [
        "The quick brown fox jumps over the lazy dog.",
        "She sells seashells by the seashore.",
        "Peter Piper picked a peck of pickled peppers.",
        "How much wood would a woodchuck chuck, if a woodchuck could chuck wood?",
        "I scream, you scream, we all scream for ice cream!",
        "The rain in Spain stays mainly in the plain.",
        "To be or not to be, that is the question.",
        "All work and no play makes Jack a dull boy.",
        "A picture is worth a thousand words.",
        "You can't handle the truth!",
    ]
    speakers = ["John", "Jane", "Bob", "Alice"]

    # A private generator seeded from the URL hash. Seeding (and afterwards
    # re-seeding) the module-level generator would discard whatever seed the
    # caller had set for its own use of ``random``.
    rng = random.Random(int(hashlib.sha256(url.encode()).hexdigest(), 16))

    # Mock data generation. The draw order (speakers, first bounds, second
    # bounds, texts) is kept fixed so that speakers and texts for a given URL
    # do not depend on how the time bounds are post-processed.
    chosen_speakers = rng.choices(speakers, k=num_rows)
    first_bounds = [rng.uniform(0, 10) for _ in range(num_rows)]
    second_bounds = [rng.uniform(0, 10) for _ in range(num_rows)]
    texts = [rng.choice(sentences) for _ in range(num_rows)]

    # The two bounds are drawn independently, so order each pair to make sure
    # a segment never ends before it starts.
    starts = [min(a, b) for a, b in zip(first_bounds, second_bounds)]
    ends = [max(a, b) for a, b in zip(first_bounds, second_bounds)]

    # Create DataFrame
    df = pd.DataFrame(
        {
            "speaker": chosen_speakers,
            "start": starts,
            "end": ends,
            "text": texts,
        }
    )

    # Stand-in for the time a transcription would take.
    if simulated_latency > 0:
        time.sleep(simulated_latency)
    return df