91
社区成员
发帖
与我相关
我的任务
编写脚本步骤:
解析event_log.jsonl - 识别成功和失败的episode
删除失败episode的文件 - 删除data和videos目录下对应的文件
更新meta/episodes.jsonl - 删除失败记录,重新编号成功episode
更新meta/episodes_stats.jsonl - 同步更新统计文件
重命名文件 - 将文件重命名为新的连续索引(0,1,2,...)
更新parquet文件内容 - 更新parquet文件内部的episode_index字段
更新meta/info.json - 重新计算total_episodes、total_frames等统计信息
data_cleaner部分代码:
import json
import os
import shutil
from pathlib import Path
from collections import defaultdict
import pyarrow.parquet as pq
import pandas as pd
def parse_event_log(event_log_path):
    """Parse event_log.jsonl and split episodes into successes and failures.

    The log is consumed as consecutive groups of three JSON records:
    the first supplies ``payload.recording_idx``, the second supplies
    ``payload.type`` (the outcome), and the third supplies
    ``payload.episode_idx``.

    Args:
        event_log_path: Path to the JSON-lines event log.

    Returns:
        A ``(success_episodes, mistake_episodes)`` tuple. Each element is a
        list of ``(recording_idx, episode_idx)`` tuples, in log order.

    Raises:
        json.JSONDecodeError: If a record in a complete group is not valid JSON.
        KeyError: If a record lacks the expected ``payload`` field.
    """
    with open(event_log_path, 'r', encoding='utf-8') as f:
        # Blank lines are not records: parsing them would raise, and counting
        # them would shift every following group of three out of alignment.
        raw_records = [line for line in f if line.strip()]

    success_episodes = []
    mistake_episodes = []

    # Walk complete groups only; a trailing partial group is ignored and is
    # never parsed, so a half-written final record cannot abort the run.
    for start in range(0, len(raw_records) - 2, 3):
        begin_record, outcome_record, episode_record = (
            json.loads(raw) for raw in raw_records[start:start + 3]
        )

        key = (
            begin_record['payload']['recording_idx'],
            episode_record['payload']['episode_idx'],
        )
        outcome = outcome_record['payload']['type']

        if outcome == 'mark_mistake':
            mistake_episodes.append(key)
        elif outcome == 'end_recording':
            success_episodes.append(key)
        # Any other outcome type is deliberately left unclassified.

    return success_episodes, mistake_episodes
def delete_episode_files(data_dir, episode_idx, video_cameras=['cam_high', 'cam_left_wrist', 'cam_right_wrist']):
def update_episodes_jsonl(episodes_jsonl_path, mistake_episode_indices):
def update_episodes_stats_jsonl(episodes_stats_jsonl_path, mistake_episode_indices, episode_index_mapping):
def rename_episode_files(data_dir, episode_index_mapping, video_cameras=['cam_high', 'cam_left_wrist', 'cam_right_wrist']):
def update_info_json(info_json_path, updated_episodes, updated_stats):
def update_parquet_episode_indices(data_dir, episode_index_mapping):
def clean_data(data_dir):
    """Remove failed episodes from a dataset directory and re-index the rest.

    Runs the following steps in order:
      1. Parse ``event_log.jsonl`` to find successful and failed episodes.
      2. Delete the files belonging to each failed episode.
      3. Update ``meta/episodes.jsonl`` and obtain the index mapping.
      4. Update ``meta/episodes_stats.jsonl``.
      5. Rename the remaining episode files according to the mapping.
      6. Update the ``episode_index`` field inside the parquet files.
      7. Update ``meta/info.json``.

    Progress is reported on stdout. When the log contains no failed
    episodes the function returns right after step 1 without changing
    anything.

    Args:
        data_dir: Dataset root directory (string or path-like) that
            contains ``event_log.jsonl``.

    Raises:
        FileNotFoundError: If ``event_log.jsonl`` is missing from ``data_dir``.
    """
    data_dir = Path(data_dir)
    event_log_path = data_dir / 'event_log.jsonl'
    if not event_log_path.exists():
        raise FileNotFoundError(f"event_log.jsonl not found in {data_dir}")

    meta_dir = data_dir / 'meta'
    banner = "=" * 60

    print(banner)
    print("开始数据清洗流程")
    print(banner)

    # Step 1: find out which episodes failed.
    print("\n[步骤1] 解析event_log.jsonl...")
    success_episodes, mistake_episodes = parse_event_log(event_log_path)
    mistake_episode_indices = {ep_idx for _, ep_idx in mistake_episodes}
    failed_sorted = sorted(mistake_episode_indices)

    print(f" 成功episode数: {len(success_episodes)}")
    print(f" 失败episode数: {len(mistake_episodes)}")
    print(f" 失败episode索引: {failed_sorted}")

    if not mistake_episodes:
        print(" 没有失败的episode,无需清洗")
        return

    # Step 2: delete files of failed episodes. Iterate the de-duplicated
    # indices so an index logged more than once is only deleted once.
    print("\n[步骤2] 删除失败episode的文件...")
    for ep_idx in failed_sorted:
        print(f" 删除episode {ep_idx}的文件...")
        delete_episode_files(data_dir, ep_idx)

    # Step 3: rewrite episodes.jsonl and build the index mapping.
    print("\n[步骤3] 更新meta/episodes.jsonl...")
    episodes_jsonl_path = meta_dir / 'episodes.jsonl'
    updated_episodes, episode_index_mapping = update_episodes_jsonl(
        episodes_jsonl_path, mistake_episode_indices
    )
    print(f" 更新后episode数: {len(updated_episodes)}")
    print(f" 索引映射: {episode_index_mapping}")

    # Step 4: keep the per-episode stats file in sync.
    print("\n[步骤4] 更新meta/episodes_stats.jsonl...")
    episodes_stats_jsonl_path = meta_dir / 'episodes_stats.jsonl'
    updated_stats = update_episodes_stats_jsonl(
        episodes_stats_jsonl_path, mistake_episode_indices, episode_index_mapping
    )
    print(f" 更新后统计记录数: {len(updated_stats)}")

    # Step 5: rename files on disk to match the new indices.
    print("\n[步骤5] 重命名文件以匹配新索引...")
    rename_episode_files(data_dir, episode_index_mapping)

    # Step 6: update the episode_index field stored inside the parquet files.
    print("\n[步骤6] 更新parquet文件中的episode_index字段...")
    update_parquet_episode_indices(data_dir, episode_index_mapping)

    # Step 7: refresh the dataset-level summary.
    print("\n[步骤7] 更新meta/info.json...")
    info_json_path = meta_dir / 'info.json'
    update_info_json(info_json_path, updated_episodes, updated_stats)
    print(" 更新统计信息完成")

    print("\n" + banner)
    print("数据清洗完成!")
    print(banner)
    print(f"原始episode数: {len(success_episodes) + len(mistake_episodes)}")
    print(f"清洗后episode数: {len(updated_episodes)}")
    print(f"删除的episode: {failed_sorted}")
if __name__ == "__main__":
    # Script entry point: take the dataset directory from the first
    # command-line argument, ask for confirmation, then run the cleaning.
    import sys
    import traceback

    # Falls back to an empty string when no argument is given.
    data_dir = sys.argv[1] if len(sys.argv) > 1 else ""

    # Cleaning deletes files, so require an explicit "yes" first.
    response = input(f"确认要清洗数据目录 {data_dir} 吗?(yes/no): ")
    if response.lower() != 'yes':
        print("操作已取消")
        sys.exit(0)

    try:
        clean_data(data_dir)
    except Exception as e:
        print(f"\n错误: {e}")
        traceback.print_exc()
        # Report the failure to the calling shell instead of exiting with 0.
        sys.exit(1)
运行方式:python clean_data.py path_to_your_data_dir
91
社区成员
发帖
与我相关
我的任务加载中
「智能机器人开发者大赛」官方平台,致力于为开发者和参赛选手提供赛事技术指导、行业标准解读及团队实战案例解析;聚焦智能机器人开发全栈技术闭环,助力开发者攻克技术瓶颈,促进软硬件集成、场景应用及商业化落地
试试用AI创作助手写篇文章吧