Merge pull request #9 from langchain-ai/brace/verify-github-links

Implement verify github & general links node
langchain-ai · Nov 23, 2024 · c4b06be · c4b06be
2 parents 62220cb + 6ff9d3f
commit c4b06be
Show file tree

Hide file tree

Showing 4 changed files with 294 additions and 10 deletions.
diff --git a/package.json b/package.json
@@ -23,11 +23,14 @@
   },
   "dependencies": {
     "@langchain/anthropic": "^0.3.8",
+    "@langchain/community": "^0.3.15",
     "@langchain/core": "^0.3.18",
     "@langchain/google-vertexai-web": "^0.1.2",
     "@langchain/langgraph": "^0.2.22",
+    "@mendable/firecrawl-js": "^1.8.5",
     "@slack/web-api": "^7.7.0",
-    "moment": "^2.30.1"
+    "moment": "^2.30.1",
+    "zod": "^3.23.8"
   },
   "devDependencies": {
     "@eslint/eslintrc": "^3.1.0",

diff --git a/src/agent/subgraphs/generate-post/nodes/verify-general.ts b/src/agent/subgraphs/generate-post/nodes/verify-general.ts
@@ -1,17 +1,112 @@
 import { LangGraphRunnableConfig } from "@langchain/langgraph";
-import { GraphAnnotation } from "../state.js";
+import { GraphAnnotation, VerifyContentAnnotation } from "../state.js";
+import { z } from "zod";
+import { ChatAnthropic } from "@langchain/anthropic";
+import { FireCrawlLoader } from "@langchain/community/document_loaders/web/firecrawl";
 
+/** State update emitted by this node: the links kept and their page contents. */
 type VerifyGeneralContentReturn = {
   relevantLinks: (typeof GraphAnnotation.State)["relevantLinks"];
   pageContents: (typeof GraphAnnotation.State)["pageContents"];
 };
 
+/** Structured output requested from the model: its reasoning plus a boolean relevancy verdict. */
+const RELEVANCY_SCHEMA = z
+  .object({
+    reasoning: z
+      .string()
+      .describe(
+        "Reasoning for why the webpage is or isn't relevant to LangChain's products.",
+      ),
+    relevant: z
+      .boolean()
+      .describe(
+        "Whether or not the webpage is relevant to LangChain's products.",
+      ),
+  })
+  .describe("The relevancy of the content to LangChain's products.");
+
+/** System prompt asking the model to judge whether a web page implements LangChain's products. */
+const VERIFY_LANGCHAIN_RELEVANT_CONTENT_PROMPT = `You are a highly regarded marketing employee at LangChain.
+You're provided with a webpage containing content a third party submitted to LangChain claiming it's relevant and implements LangChain's products.
+Your task is to carefully read over the entire page, and determine whether or not the content actually implements and is relevant to LangChain's products.
+You're doing this to ensure the content is relevant to LangChain, and it can be used as marketing material to promote LangChain.
+
+For context, LangChain has three main products you should be looking out for:
+- **LangChain** - the main open source libraries developers use for building AI applications. These are open source Python/JavaScript/TypeScript libraries.
+- **LangGraph** - an open source library for building agentic AI applications. This is a Python/JavaScript/TypeScript library.
+  LangChain also offers a hosted cloud platform called 'LangGraph Cloud' or 'LangGraph Platform' which developers can use to host their LangGraph applications in production.
+- **LangSmith** - this is LangChain's SaaS product for building AI applications. It offers solutions for evaluating AI systems, observability, datasets and testing.
+
+Given this context, examine the webpage content closely, and determine if the content implements LangChain's products.
+You should provide reasoning as to why or why not the content implements LangChain's products, then a simple true or false for whether or not it implements some.`;
+
 /**
- * Verifies the content provided is relevant to LangChain products.
+ * Verifies that the web page at `state.link` is relevant to LangChain products.
+ *
+ * Scrapes that single page with FireCrawl, asks the model for a structured
+ * relevancy verdict, and keeps the link and its content only when the verdict
+ * is positive.
+ *
+ * @param state - Verification state; only `state.link` is read.
+ * @param _config - LangGraph runnable config (unused).
+ * @returns The link and its page content when relevant, otherwise empty arrays.
  */
 export async function verifyGeneralContent(
-  _state: typeof GraphAnnotation.State,
+  state: typeof VerifyContentAnnotation.State,
   _config: LangGraphRunnableConfig,
 ): Promise<VerifyGeneralContentReturn> {
-  throw new Error("Not implemented");
+  const loader = new FireCrawlLoader({
+    url: state.link,
+    // "scrape" loads only this URL; "crawl" would also pull in every
+    // accessible subpage of the site.
+    mode: "scrape",
+  });
+  const docs = await loader.load();
+  const pageContent = docs.map((d) => d.pageContent).join("\n");
+
+  // Nothing was scraped, so there is nothing to judge: exclude the URL
+  // without spending a model call on an empty message.
+  if (pageContent.trim() === "") {
+    return {
+      relevantLinks: [],
+      pageContents: [],
+    };
+  }
+
+  const relevancyModel = new ChatAnthropic({
+    model: "claude-3-5-sonnet-20241022",
+    temperature: 0,
+  }).withStructuredOutput(RELEVANCY_SCHEMA, {
+    name: "relevancy",
+  });
+
+  const { relevant } = await relevancyModel
+    .withConfig({
+      runName: "check-general-relevancy-model",
+    })
+    .invoke([
+      {
+        role: "system",
+        content: VERIFY_LANGCHAIN_RELEVANT_CONTENT_PROMPT,
+      },
+      {
+        role: "user",
+        content: pageContent,
+      },
+    ]);
+
+  if (relevant) {
+    return {
+      // TODO: Replace with actual relevant link/page content (summary in this case)
+      relevantLinks: [state.link],
+      pageContents: [pageContent],
+    };
+  }
+
+  // Not relevant, return empty arrays so this URL is not included.
+  return {
+    relevantLinks: [],
+    pageContents: [],
+  };
 }
diff --git a/src/agent/subgraphs/generate-post/nodes/verify-github.ts b/src/agent/subgraphs/generate-post/nodes/verify-github.ts
@@ -1,17 +1,157 @@
+import { z } from "zod";
 import { LangGraphRunnableConfig } from "@langchain/langgraph";
-import { GraphAnnotation } from "../state.js";
+import { GraphAnnotation, VerifyContentAnnotation } from "../state.js";
+import { ChatAnthropic } from "@langchain/anthropic";
 
+/** State update emitted by this node: the links kept and their page contents. */
 type VerifyGitHubContentReturn = {
   relevantLinks: (typeof GraphAnnotation.State)["relevantLinks"];
   pageContents: (typeof GraphAnnotation.State)["pageContents"];
 };
 
+/** Structured output requested from the model: its reasoning plus a boolean relevancy verdict. */
+const RELEVANCY_SCHEMA = z
+  .object({
+    reasoning: z
+      .string()
+      .describe(
+        "Reasoning for why the Readme of the GitHub repository is or isn't relevant to LangChain's products.",
+      ),
+    relevant: z
+      .boolean()
+      .describe(
+        "Whether or not the Readme of the GitHub repository is relevant to LangChain's products.",
+      ),
+  })
+  .describe("The relevancy of the content to LangChain's products.");
+
+/** System prompt asking the model to judge whether a repository README implements LangChain's products. */
+const VERIFY_LANGCHAIN_RELEVANT_CONTENT_PROMPT = `You are a highly regarded marketing employee at LangChain.
+You're given the Readme of a GitHub repository and need to verify the repository implements LangChain's products.
+You're doing this to ensure the content is relevant to LangChain, and it can be used as marketing material to promote LangChain.
+
+For context, LangChain has three main products you should be looking out for:
+- **LangChain** - the main open source libraries developers use for building AI applications. These are open source Python/JavaScript/TypeScript libraries.
+- **LangGraph** - an open source library for building agentic AI applications. This is a Python/JavaScript/TypeScript library.
+  LangChain also offers a hosted cloud platform called 'LangGraph Cloud' or 'LangGraph Platform' which developers can use to host their LangGraph applications in production.
+- **LangSmith** - this is LangChain's SaaS product for building AI applications. It offers solutions for evaluating AI systems, observability, datasets and testing.
+
+Given this context, examine the Readme closely, and determine if the repository implements LangChain's products.
+You should provide reasoning as to why or why not the repository implements LangChain's products, then a simple true or false for whether or not it implements some.`;
+
+/** Origin that serves raw file contents of GitHub repositories. */
+const RAW_GITHUB_ORIGIN = "https://raw.githubusercontent.com";
+
+/**
+ * Fetches `url` and returns the response body as text.
+ *
+ * Returns `undefined` when the request throws, the response status is
+ * not OK (`fetch` does not reject on e.g. a 404), or the body is blank.
+ */
+async function fetchTextIfOk(url: string): Promise<string | undefined> {
+  try {
+    const res = await fetch(url);
+    if (!res.ok) {
+      return undefined;
+    }
+    const text = await res.text();
+    return text.trim() === "" ? undefined : text;
+  } catch (e) {
+    console.error(`Failed to fetch ${url}`, e);
+    return undefined;
+  }
+}
+
 /**
- * Verifies the content provided is relevant to LangChain products.
+ * Verifies that the GitHub repository at `state.link` is relevant to LangChain products.
+ *
+ * Reads the repository README (raw file on `main`, then on `master`,
+ * falling back to the linked page itself), asks the model for a
+ * structured relevancy verdict, and keeps the link and README only when
+ * the verdict is positive.
+ *
+ * @param state - Verification state; only `state.link` is read.
+ * @param _config - LangGraph runnable config (unused).
+ * @returns The link and README content when relevant; empty arrays when
+ *   the link cannot be parsed, nothing can be fetched, or the content
+ *   is not relevant.
  */
 export async function verifyGitHubContent(
-  _state: typeof GraphAnnotation.State,
+  state: typeof VerifyContentAnnotation.State,
   _config: LangGraphRunnableConfig,
 ): Promise<VerifyGitHubContentReturn> {
-  throw new Error("Not implemented");
+  // URLs to try, in order of preference.
+  const candidateUrls: string[] = [];
+  try {
+    const githubUrl = new URL(state.link);
+    // Keep only the owner/repo segments; the rest of the path is dropped.
+    const [owner, repo] = githubUrl.pathname.split("/").filter(Boolean);
+    if (owner && repo) {
+      // The pathname alone is not fetchable: raw files live on a separate host.
+      const rawRepoUrl = `${RAW_GITHUB_ORIGIN}/${owner}/${repo}`;
+      candidateUrls.push(
+        `${rawRepoUrl}/refs/heads/main/README.md`,
+        `${rawRepoUrl}/refs/heads/master/README.md`,
+      );
+    }
+  } catch (e) {
+    console.error("Failed to parse GitHub URL", e);
+    return {
+      relevantLinks: [],
+      pageContents: [],
+    };
+  }
+  // Last resort: read the content of the original URL.
+  candidateUrls.push(state.link);
+
+  let readmeContent: string | undefined;
+  for (const url of candidateUrls) {
+    readmeContent = await fetchTextIfOk(url);
+    if (readmeContent !== undefined) {
+      break;
+    }
+  }
+  if (readmeContent === undefined) {
+    // Nothing could be fetched, so there is nothing to judge: exclude the URL.
+    return {
+      relevantLinks: [],
+      pageContents: [],
+    };
+  }
+
+  const relevancyModel = new ChatAnthropic({
+    model: "claude-3-5-sonnet-20241022",
+    temperature: 0,
+  }).withStructuredOutput(RELEVANCY_SCHEMA, {
+    name: "relevancy",
+  });
+
+  const { relevant } = await relevancyModel
+    .withConfig({
+      runName: "check-github-relevancy-model",
+    })
+    .invoke([
+      {
+        role: "system",
+        content: VERIFY_LANGCHAIN_RELEVANT_CONTENT_PROMPT,
+      },
+      {
+        role: "user",
+        content: readmeContent,
+      },
+    ]);
+
+  if (relevant) {
+    return {
+      // TODO: Replace with actual relevant link/page content (summary in this case)
+      relevantLinks: [state.link],
+      pageContents: [readmeContent],
+    };
+  }
+
+  // Not relevant, return empty arrays so this URL is not included.
+  return {
+    relevantLinks: [],
+    pageContents: [],
+  };
 }