Spaces:
Build error
Build error
| import duckdb | |
| import yaml | |
| import time | |
| def extract_base_models(tags): | |
| base_models = set() # Use a set to ensure uniqueness | |
| for tag in tags: | |
| if tag.startswith("base_model:"): | |
| base_model = tag.split(":")[-1] | |
| base_model = ''.join(c for c in base_model if c.isalnum() or c in '/_.-') | |
| base_models.add(base_model) # Add to set | |
| return list(base_models) # Convert back to list | |
| # Create a DuckDB connection | |
| con = duckdb.connect() | |
| # Register the Python UDF with explicit return type | |
| con.create_function("extract_base_models", extract_base_models, return_type="VARCHAR[]") | |
| # Query to extract base models using the Python UDF | |
| query = """ | |
| SELECT | |
| _id, | |
| id, | |
| extract_base_models(tags) AS base_models | |
| FROM parquet_scan('public/models.parquet') | |
| """ | |
| start_time = time.time() | |
| # Execute the query and create a view | |
| con.execute(f"CREATE VIEW parent_models AS {query}") | |
| # Write the view to a parquet file | |
| con.execute("COPY parent_models TO 'public/parents.parquet' (FORMAT 'parquet')") | |
| end_time = time.time() | |
| execution_time = end_time - start_time | |
| print(f"Query execution time: {execution_time:.2f} seconds") | |
| # Filter rows with non-empty base_models | |
| con.execute(""" | |
| CREATE VIEW non_empty_base_models AS | |
| SELECT * | |
| FROM parent_models | |
| WHERE ARRAY_LENGTH(base_models) > 0 | |
| """) | |
| # Write a random sample of 10 rows with non-empty base_models to yaml file for inspection | |
| result = con.execute(""" | |
| SELECT _id, id, base_models | |
| FROM non_empty_base_models | |
| ORDER BY RANDOM() | |
| LIMIT 10 | |
| """).fetchall() | |
| with open("public/parents.example.yaml", "w") as f: | |
| yaml.safe_dump(result, f, default_flow_style=False) | |