Spaces:
Running
Running
# Split the YFCC100M metadata into a held-out "4k" test set and a GPS-tagged
# training set.
#
# Pipeline:
#   1. Read the tab-separated YFCC100M metadata dump (no header row).
#   2. Keep only the columns needed downstream.
#   3. Keep rows that have both coordinates and media_type == 0
#      (presumably "photo" rather than "video" -- confirm against the
#      dataset documentation).
#   4. Left-join the per-photo hash table on photo_id.
#   5. Rows whose photo_id is listed in yfcc_4k_ids.txt become the test set;
#      the remaining rows with accuracy >= 12 become the training set.
#   6. Write the test set as one TSV file and the training set as TSV shards.

import dask
import dask.dataframe as dd
from dask.diagnostics import ProgressBar

# Everything runs inside the ProgressBar context so that each step which
# triggers a computation (len(...) and both to_csv calls) reports progress.
with ProgressBar():
    ddf = dd.read_csv(
        "../datasets/YFCC100M/yfcc100m_dataset",
        names=[
            "photo_id",
            "user_nsid",
            "user_nickname",
            "date_taken",
            "date_uploaded",
            "capture_device",
            "title",
            "description",
            "user_tags",
            "machine_tags",
            "longitude",
            "latitude",
            "accuracy",
            "page_url",
            "download_url",
            "license_name",
            "license_url",
            "server_id",
            "farm_id",
            "secret",
            "secret_original",
            "extension",
            "media_type",
        ],
        dtype={
            "photo_id": str,
            "user_nsid": str,
            "user_nickname": str,
            "user_tags": str,
            "machine_tags": str,
            "longitude": float,
            "latitude": float,
            "accuracy": float,
            "server_id": str,
            "farm_id": str,
            "secret": str,
            "secret_original": str,
            "extension": str,
            "media_type": float,
        },
        sep="\t",
    )

    # Project down to the columns used below and written to the outputs.
    ddf = ddf[
        [
            "photo_id",
            "longitude",
            "latitude",
            "accuracy",
            "extension",
            "download_url",
            "media_type",
        ]
    ]

    # Keep geotagged rows with media_type == 0.
    filtered_ddf = ddf[
        ddf["longitude"].notnull()
        & ddf["latitude"].notnull()
        & (ddf["media_type"] == 0)
    ]
    # NOTE(review): the script used to run `del ddf["media_type"]` here. That
    # only altered `ddf`, which is not used again, so `filtered_ddf` (and thus
    # both output files) still carries the `media_type` column. The no-op was
    # removed and the output schema left unchanged. If the column should be
    # dropped from the outputs, use
    # `filtered_ddf = filtered_ddf.drop(columns=["media_type"])` instead.

    hash_ddf = dd.read_csv(
        "../datasets/YFCC100M/yfcc100m_hash",
        names=["photo_id", "hash"],
        dtype={"photo_id": str, "hash": str},
        sep="\t",
    )
    # Left join: rows without a matching hash are kept with a missing `hash`.
    filtered_ddf = filtered_ddf.merge(hash_ddf, on="photo_id", how="left")

    # Read the 4k photo IDs (one ID per line).
    with open(
        "../datasets/YFCC100M/yfcc_4k_ids.txt", "r", encoding="utf-8"
    ) as f:
        test_photo_ids = set(f.read().splitlines())

    # Split the dataframe based on whether photo_id is in the test set.
    # (Named `is_test` rather than `filter` to avoid shadowing the builtin.)
    is_test = filtered_ddf["photo_id"].isin(test_photo_ids)
    test_ddf = filtered_ddf[is_test]
    train_ddf = filtered_ddf[~is_test]

    # The accuracy threshold is applied to the training rows only; test rows
    # are kept regardless of their accuracy value.
    train_ddf = train_ddf[train_ddf["accuracy"] >= 12]

    # Save the split dataframes.
    test_ddf.to_csv(
        "../datasets/YFCC100M/yfcc_4k_dataset_with_gps.csv",
        sep="\t",
        index=False,
        single_file=True,
    )

    # len(train_ddf) triggers a full computation pass to count the rows; the
    # count sets the shard count so partitions average at most 100000 rows.
    train_ddf = train_ddf.repartition(npartitions=len(train_ddf) // 100000 + 1)
    train_ddf.to_csv(
        "../datasets/YFCC100M/yfcc100m_dataset_with_gps_train/*.csv",
        sep="\t",
        index=False,
        single_file=False,
    )