Merge pull request #60 from ucbepic/shreyashankar/oss
docs: add documentation for using split gather pipeline
shreyashankar authored Oct 3, 2024
2 parents fcc3368 + 455125d commit a7b6373
Showing 12 changed files with 345 additions and 11 deletions.
2 changes: 1 addition & 1 deletion README.md
@@ -31,7 +31,7 @@ python --version
1. Clone the DocETL repository:

```bash
-git clone https://github.com/shreyashankar/docetl.git
+git clone https://github.com/ucbepic/docetl.git
cd docetl
```

6 changes: 6 additions & 0 deletions docetl/builder.py
@@ -80,6 +80,12 @@ def items(self):
class Optimizer:
@classmethod
def from_yaml(cls, yaml_file: str, **kwargs):
+        # check that file ends with .yaml or .yml
+        if not yaml_file.endswith(".yaml") and not yaml_file.endswith(".yml"):
+            raise ValueError(
+                "Invalid file type. Please provide a YAML file ending with '.yaml' or '.yml'."
+            )

base_name = yaml_file.rsplit(".", 1)[0]
suffix = yaml_file.split("/")[-1].split(".")[0]
config = load_config(yaml_file)
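The extension check added to `Optimizer.from_yaml` (and mirrored in `runner.py` below) can be sketched in isolation. `validate_yaml_path` is a hypothetical helper name for illustration, not part of DocETL's API:

```python
def validate_yaml_path(yaml_file: str) -> str:
    """Reject non-YAML paths, mirroring the check added to from_yaml."""
    if not yaml_file.endswith(".yaml") and not yaml_file.endswith(".yml"):
        raise ValueError(
            "Invalid file type. Please provide a YAML file ending with '.yaml' or '.yml'."
        )
    # rsplit from the right keeps dots in directory names intact,
    # e.g. "runs.v2/pipeline.yaml" -> "runs.v2/pipeline"
    return yaml_file.rsplit(".", 1)[0]
```

Failing fast here gives a clear error instead of a confusing YAML parse failure further down.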
2 changes: 1 addition & 1 deletion docetl/operations/split.py
@@ -52,7 +52,7 @@ def execute(self, input_data: List[Dict]) -> Tuple[List[Dict], float]:
method = self.config["method"]
method_kwargs = self.config["method_kwargs"]
encoder = tiktoken.encoding_for_model(
-            self.config["method_kwargs"].get("model", self.default_model)
+            self.config["method_kwargs"].get("model", self.default_model).split("/")[-1]
)
results = []
cost = 0.0
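The `.split("/")[-1]` strips a LiteLLM-style provider prefix (e.g. `openai/gpt-4o-mini`), since `tiktoken.encoding_for_model` only recognizes bare model names. A minimal sketch of just the stripping step (the helper name is ours, assumed for illustration):

```python
def bare_model_name(model: str) -> str:
    # Provider-prefixed names like "openai/gpt-4o-mini" or "azure/gpt-4o"
    # become "gpt-4o-mini" / "gpt-4o"; names without a "/" pass through.
    return model.split("/")[-1]
```

The stripped name is what gets handed to `tiktoken.encoding_for_model`.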
9 changes: 7 additions & 2 deletions docetl/operations/utils.py
@@ -634,7 +634,9 @@ def truncate_messages(
"""
Truncate the messages to fit the model's context length.
"""
-    model_input_context_length = model_cost.get(model, {}).get("max_input_tokens", 8192)
+    model_input_context_length = model_cost.get(model.split("/")[-1], {}).get(
+        "max_input_tokens", 8192
+    )
total_tokens = sum(count_tokens(json.dumps(msg), model) for msg in messages)

if total_tokens <= model_input_context_length - 100:
@@ -645,7 +647,10 @@
content = longest_message["content"]
excess_tokens = total_tokens - model_input_context_length + 200 # 200 token buffer

-    encoder = tiktoken.encoding_for_model(model)
+    try:
+        encoder = tiktoken.encoding_for_model(model.split("/")[-1])
+    except Exception:
+        encoder = tiktoken.encoding_for_model("gpt-4o")
encoded_content = encoder.encode(content)
tokens_to_remove = min(len(encoded_content), excess_tokens)
mid_point = len(encoded_content) // 2
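Both `utils.py` changes follow the same pattern: strip the provider prefix before a lookup keyed on bare model names, and fall back to a safe default when the lookup fails. A standalone sketch under stated assumptions (the function names and the injected `lookup` callable are ours; in DocETL the real lookups are LiteLLM's `model_cost` table and `tiktoken.encoding_for_model`):

```python
def max_input_tokens(model: str, model_cost: dict, default: int = 8192) -> int:
    # The cost table is keyed by bare names, so strip any "provider/" prefix;
    # unknown models fall back to an 8192-token context window.
    return model_cost.get(model.split("/")[-1], {}).get("max_input_tokens", default)

def encoding_with_fallback(model: str, lookup, fallback: str = "gpt-4o"):
    # Names the tokenizer rejects fall back to a known encoding
    # instead of raising mid-pipeline.
    try:
        return lookup(model.split("/")[-1])
    except Exception:
        return lookup(fallback)
```

Defaulting rather than raising keeps a long-running pipeline alive when a model name is unfamiliar, at the cost of a possibly inexact token count.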
6 changes: 6 additions & 0 deletions docetl/runner.py
@@ -69,6 +69,12 @@ def __init__(self, config: Dict, max_threads: int = None):

@classmethod
def from_yaml(cls, yaml_file: str, **kwargs):
+        # check that file ends with .yaml or .yml
+        if not yaml_file.endswith(".yaml") and not yaml_file.endswith(".yml"):
+            raise ValueError(
+                "Invalid file type. Please provide a YAML file ending with '.yaml' or '.yml'."
+            )

config = load_config(yaml_file)
return cls(config, **kwargs)

4 changes: 2 additions & 2 deletions docs/community/index.md
@@ -10,7 +10,7 @@ While we don't have a formal code of conduct page, we expect all community membe

We welcome contributions from everyone who is interested in improving DocETL. Here's how you can get involved:

-1. **Report Issues**: If you encounter a bug or have a feature request, open an issue on our [GitHub repository](https://github.com/shreyashankar/docetl/issues).
+1. **Report Issues**: If you encounter a bug or have a feature request, open an issue on our [GitHub repository](https://github.com/ucbepic/docetl/issues).

2. **Join Discussions**: Have a question or want to discuss ideas? Post on our [Discord server](https://discord.gg/fHp7B2X3xx).

@@ -27,7 +27,7 @@ To contribute code:

## Connect with Us

-- **GitHub Repository**: Contribute to the project or report issues on our [GitHub repo](https://github.com/shreyashankar/docetl).
+- **GitHub Repository**: Contribute to the project or report issues on our [GitHub repo](https://github.com/ucbepic/docetl).
- **Discord Community**: Join our [Discord server](https://discord.gg/fHp7B2X3xx); we're looking to build a vibrant community of people interested in intelligent document processing.
- **Lab Webpages**: We are affiliated with the EPIC Lab at UC Berkeley. Visit our [Lab Page](https://epic.berkeley.edu) for a description of our research. We are also affiliated with the Data Systems and Foundations group at UC Berkeley--visit our [DSF Page](https://dsf.berkeley.edu) for more information.

2 changes: 1 addition & 1 deletion docs/examples/mining-product-reviews.md
@@ -521,4 +521,4 @@ Upon further analysis, 1421 themes is still a lot! I realized that my resolve op

Something else we could have done is included a list of themes we care about in the original map operation, e.g., graphics. Since our map prompt was very open-ended, the LLM could have generated themes that we didn't care about, leading to a large number of themes in the output.

-Anyways, we've filtered the 1421 reports down to 65 themes/reports that contain quotes from 3 or more different games. You can check out the output [here](https://github.com/shreyashankar/docetl/blob/main/example_data/steamgames/frequent_polarizing_themes.json).
+Anyways, we've filtered the 1421 reports down to 65 themes/reports that contain quotes from 3 or more different games. You can check out the output [here](https://github.com/ucbepic/docetl/blob/main/example_data/steamgames/frequent_polarizing_themes.json).
6 changes: 3 additions & 3 deletions docs/examples/presidential-debate-themes.md
@@ -9,7 +9,7 @@ Our goal is to build a pipeline that will:
1. Extract key themes and viewpoints from presidential debate transcripts
2. Analyze how these themes have evolved over time, with references to specific debates and quotes

-You can take a look at the raw data [here](https://github.com/shreyashankar/docetl/tree/main/example_data/debates/data.json).
+You can take a look at the raw data [here](https://github.com/ucbepic/docetl/tree/main/example_data/debates/data.json).

Let's examine the pipeline structure and its operations:

@@ -319,7 +319,7 @@ This output shows the progress of our pipeline execution, including the differen

## Initial Results

-Our pipeline generated reports on various themes discussed in the presidential debates. We've put the results up [here](https://github.com/shreyashankar/docetl/tree/main/example_data/debates/theme_evolution_analysis_baseline.json). However, upon inspection, we found that these reports were lacking in depth and recency. Let's look at a few examples:
+Our pipeline generated reports on various themes discussed in the presidential debates. We've put the results up [here](https://github.com/ucbepic/docetl/tree/main/example_data/debates/theme_evolution_analysis_baseline.json). However, upon inspection, we found that these reports were lacking in depth and recency. Let's look at a few examples:

!!! example "Example Reports Lacking in Recent Quotes"

@@ -509,7 +509,7 @@ Validator improvements (gleaning round 1):
3. **Structure and Flow**: Overall, the output is fairly well-structured and maintains a logical flow, using headings appropriately to signal key sections. However, it may benefit from clearer subsections under each party's overview to delineate specific key points, such as 'Tax Policy', 'Job Creation', and 'Response to Economic Crises'. This would enhance readability and assist the reader in navigating the shifts in viewpoints. For example, adding bullet points or more vivid transitions between historical periods could clarify the evolution timeline. Moreover, resolving any redundancy (such as multiple mentions of similar themes across years) would streamline the narrative.
```

-Check out the new output [here](https://github.com/shreyashankar/docetl/tree/main/example_data/debates/theme_evolution_analysis_reduce_gleaning.json) to see the improvements made by the optimized pipeline! Of course, we can probably optimize the initial map operation too, do prompt engineering, and more to further enhance the pipeline.
+Check out the new output [here](https://github.com/ucbepic/docetl/tree/main/example_data/debates/theme_evolution_analysis_reduce_gleaning.json) to see the improvements made by the optimized pipeline! Of course, we can probably optimize the initial map operation too, do prompt engineering, and more to further enhance the pipeline.

!!! note "Interactive Pipeline Creation"
