Added some vision methods
gksoriginals committed Nov 23, 2023
1 parent 676cc56 commit ccfba5f
Showing 4 changed files with 94 additions and 2 deletions.
3 changes: 2 additions & 1 deletion README.md
@@ -1,4 +1,5 @@
-# V.E.E.N.A (Virtual Event Engagement and Networking Anchor)
+# V.E.E.N.A (IN PROGRESS)
+Virtual Event Engagement and Networking Anchor
 
 
 ## What is this ?
3 changes: 2 additions & 1 deletion requirements.txt
@@ -3,4 +3,5 @@ sounddevice
 numpy
 langchain
 python-dotenv
-keyboard
+keyboard
+opencv-python
35 changes: 35 additions & 0 deletions src/audio.py
@@ -2,6 +2,8 @@
 import sounddevice as sd
 import numpy as np
 import keyboard
+import cv2
+from vision import has_speaker_left_stage
 import scipy.io.wavfile as wav
def speak(text, client: OpenAI):
@@ -41,6 +42,40 @@ def record_speech(fs=44100):
print("Recording completed.")
return filename

def record_speech_vision(fs=44100):
    """
    This function records the speech of the speaker, stopping automatically
    when the vision model detects that the speaker has left the stage.
    :param fs: The sample rate for the recording. Default is 44100.
    :return: The filename of the recorded speech.
    """
    print("Recording... Stops automatically when the speaker leaves the stage.")

    # Record audio in one-second chunks, checking the camera between chunks.
    # Note: audio arriving while the vision check runs is not captured.
    recording = []
    cap = cv2.VideoCapture(0)
    client = OpenAI()

    while True:
        # Record audio for a short duration
        short_recording = sd.rec(int(1 * fs), samplerate=fs, channels=2)
        sd.wait()
        recording.append(short_recording)

        # Capture frame-by-frame; skip the vision check if the read fails
        ret, frame = cap.read()
        if not ret:
            continue

        # If the speaker has left the stage, stop recording
        if has_speaker_left_stage(frame, client):
            break

    cap.release()

    # Concatenate all short recordings
    recording = np.concatenate(recording, axis=0)

    # sd.rec returns float samples in [-1, 1]; scale before casting to int16
    filename = "recorded_speech.wav"
    wav.write(filename, fs, np.int16(recording * 32767))
    print("Recording completed.")
    return filename

def speech_to_text(filename, client: OpenAI):
# Open the audio file in binary mode
with open(filename, 'rb') as audio_file:
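For context, here is a minimal usage sketch (not part of this commit) showing how the new recorder could be wired into a capture-and-transcribe flow; it assumes `speech_to_text` returns the transcript text and that `OPENAI_API_KEY` is set in the environment:

# Hypothetical usage sketch, not part of this commit.
from openai import OpenAI
from audio import record_speech_vision, speech_to_text

client = OpenAI()  # assumes OPENAI_API_KEY is set in the environment

# Record until the vision check reports the speaker has left the stage,
# then transcribe the resulting WAV file.
filename = record_speech_vision(fs=44100)
transcript = speech_to_text(filename, client)  # assumed to return the transcript text
print(transcript)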
55 changes: 55 additions & 0 deletions src/vision.py
@@ -0,0 +1,55 @@
from openai import OpenAI
import cv2
import base64
from PIL import Image
import io


def convert_frame_to_image_data(frame):
    """
    This function converts a frame captured from a webcam to a format suitable for the OpenAI API.
    :param frame: The frame to convert.
    :return: The frame as a base64-encoded JPEG string.
    """
    # Convert the frame from OpenCV's BGR channel order to RGB
    frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)

    # Convert the frame to a PIL Image
    pil_image = Image.fromarray(frame)

    # Encode the PIL Image as JPEG, then base64
    buffered = io.BytesIO()
    pil_image.save(buffered, format="JPEG")
    img_str = base64.b64encode(buffered.getvalue())
    return img_str.decode('ascii')

def has_speaker_left_stage(frame, client: OpenAI):
    """
    This function uses OpenAI's GPT-4 Vision model to analyze a video frame and determine if a speaker has left the stage.
    :param frame: The video frame to analyze.
    :param client: The OpenAI client.
    :return: True if the speaker has left the stage, False otherwise.
    """
    # Convert the frame to a base64-encoded JPEG suitable for the OpenAI API
    image_data = convert_frame_to_image_data(frame)

    response = client.chat.completions.create(
        model="gpt-4-vision-preview",
        messages=[
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": "Is there a person in the image? Answer True or False."},
                    {
                        "type": "image_url",
                        "image_url": {"url": f"data:image/jpeg;base64,{image_data}"},
                    },
                ],
            }
        ],
        max_tokens=300,
    )

    # Interpret the response; the speaker has left the stage if no person is visible
    is_person_in_image = "true" in response.choices[0].message.content.lower()
    return not is_person_in_image

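A quick way to exercise the new helper in isolation is a single-frame smoke test: a minimal sketch, not part of this commit, assuming a webcam at index 0 and an `OPENAI_API_KEY` in the environment.

# Hypothetical smoke test, not part of this commit.
import cv2
from openai import OpenAI
from vision import has_speaker_left_stage

cap = cv2.VideoCapture(0)  # assumes a webcam at index 0
ret, frame = cap.read()
cap.release()

if ret:
    client = OpenAI()  # assumes OPENAI_API_KEY is set
    print("Speaker left stage:", has_speaker_left_stage(frame, client))
else:
    print("Could not read a frame from the webcam.")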