Added some vision methods
gksoriginals committed Nov 23, 2023
1 parent 676cc56 commit ccfba5f
Showing 4 changed files with 94 additions and 2 deletions.
3 changes: 2 additions & 1 deletion README.md
@@ -1,4 +1,5 @@
-# V.E.E.N.A (Virtual Event Engagement and Networking Anchor)
+# V.E.E.N.A (IN PROGRESS)
+Virtual Event Engagement and Networking Anchor
 
 
 ## What is this ?
3 changes: 2 additions & 1 deletion requirements.txt
@@ -3,4 +3,5 @@ sounddevice
 numpy
 langchain
 python-dotenv
-keyboard
+keyboard
+opencv-python
35 changes: 35 additions & 0 deletions src/audio.py
@@ -2,6 +2,8 @@
 import sounddevice as sd
 import numpy as np
 import keyboard
+import cv2
+from vision import has_speaker_left_stage
 import scipy.io.wavfile as wav
def speak(text, client: OpenAI):
@@ -41,6 +42,40 @@ def record_speech(fs=44100):
print("Recording completed.")
return filename

def record_speech_vision(fs=44100):
    """
    This function records the speech of the speaker, stopping automatically
    when the vision model detects that the speaker has left the stage.
    :param fs: The sample rate for the recording. Default is 44100.
    :return: The filename of the recorded speech.
    """
    print("Recording... Stops automatically when the speaker leaves the stage.")

    # Record audio in one-second chunks, checking the camera between chunks.
    # Note: audio arriving while the vision check runs is not captured.
    recording = []
    cap = cv2.VideoCapture(0)
    client = OpenAI()

    while True:
        # Record audio for a short duration
        short_recording = sd.rec(int(1 * fs), samplerate=fs, channels=2)
        sd.wait()
        recording.append(short_recording)

        # Capture frame-by-frame; skip the vision check if the read fails
        ret, frame = cap.read()
        if not ret:
            continue

        # If the speaker has left the stage, stop recording
        if has_speaker_left_stage(frame, client):
            break

    cap.release()

    # Concatenate all short recordings
    recording = np.concatenate(recording, axis=0)

    # sd.rec returns float samples in [-1, 1]; scale before casting to int16
    filename = "recorded_speech.wav"
    wav.write(filename, fs, np.int16(recording * 32767))
    print("Recording completed.")
    return filename

def speech_to_text(filename, client: OpenAI):
# Open the audio file in binary mode
with open(filename, 'rb') as audio_file:
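For context, here is a minimal usage sketch (not part of this commit) showing how the new recorder could be wired into a capture-and-transcribe flow; it assumes `speech_to_text` returns the transcript text and that `OPENAI_API_KEY` is set in the environment:

# Hypothetical usage sketch, not part of this commit.
from openai import OpenAI
from audio import record_speech_vision, speech_to_text

client = OpenAI()  # assumes OPENAI_API_KEY is set in the environment

# Record until the vision check reports the speaker has left the stage,
# then transcribe the resulting WAV file.
filename = record_speech_vision(fs=44100)
transcript = speech_to_text(filename, client)  # assumed to return the transcript text
print(transcript)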
55 changes: 55 additions & 0 deletions src/vision.py
@@ -0,0 +1,55 @@
from openai import OpenAI
import cv2
import base64
from PIL import Image
import io


def convert_frame_to_image_data(frame):
    """
    This function converts a frame captured from a webcam to a format suitable for the OpenAI API.
    :param frame: The frame to convert.
    :return: The frame as a base64-encoded JPEG string.
    """
    # Convert the frame from OpenCV's BGR channel order to RGB
    frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)

    # Convert the frame to a PIL Image
    pil_image = Image.fromarray(frame)

    # Encode the PIL Image as JPEG, then base64
    buffered = io.BytesIO()
    pil_image.save(buffered, format="JPEG")
    img_str = base64.b64encode(buffered.getvalue())
    return img_str.decode('ascii')

def has_speaker_left_stage(frame, client: OpenAI):
    """
    This function uses OpenAI's GPT-4 Vision model to analyze a video frame and determine if a speaker has left the stage.
    :param frame: The video frame to analyze.
    :param client: The OpenAI client.
    :return: True if the speaker has left the stage, False otherwise.
    """
    # Convert the frame to a base64-encoded JPEG suitable for the OpenAI API
    image_data = convert_frame_to_image_data(frame)

    response = client.chat.completions.create(
        model="gpt-4-vision-preview",
        messages=[
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": "Is there a person in the image? Answer True or False."},
                    {
                        "type": "image_url",
                        "image_url": {"url": f"data:image/jpeg;base64,{image_data}"},
                    },
                ],
            }
        ],
        max_tokens=300,
    )

    # Interpret the response; the speaker has left the stage if no person is visible
    is_person_in_image = "true" in response.choices[0].message.content.lower()
    return not is_person_in_image

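A quick way to exercise the new helper in isolation is a single-frame smoke test: a minimal sketch, not part of this commit, assuming a webcam at index 0 and an `OPENAI_API_KEY` in the environment.

# Hypothetical smoke test, not part of this commit.
import cv2
from openai import OpenAI
from vision import has_speaker_left_stage

cap = cv2.VideoCapture(0)  # assumes a webcam at index 0
ret, frame = cap.read()
cap.release()

if ret:
    client = OpenAI()  # assumes OPENAI_API_KEY is set
    print("Speaker left stage:", has_speaker_left_stage(frame, client))
else:
    print("Could not read a frame from the webcam.")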