
Commit c64aa17

Merge pull request #56 from chengzheng345/main

Remove the hosts Settings for multiple machines and only support sing…

2 parents 2e2adda + 8d4d9c4

10 files changed (+14, -241 lines)

examples/llava_ov_1_5/sample_packing/1_s1_get_tokenlens_v3-sft.py
Lines changed: 2 additions & 19 deletions

@@ -16,29 +16,12 @@
 import multiprocessing
 from multiprocessing import Pool, Manager, Value
 from tqdm import tqdm
-from tool import get_ip_info,cfg
+from tool import cfg,get_init_file

 # Declares a global cross-process counter (defined in the main module for child processes to inherit)
 global_total_counter = None
-MAX_TOKEN_LEN = cfg['sample']['max_len']
 task_type = cfg['sample']['task_type']
-DEFAULT_DIRECTORY = Path(cfg['data']['directory'])
-try:
-    ip_index,_,_=get_ip_info(cfg['hosts'])
-    print('success init ip ,>>>>>>>>>>>>>>>>>>>>>>>>')
-except:
-    print(f"getting ip_index error, default to 0")
-    ip_index=0
-DEFAULT_DIRECTORY=Path(os.path.join(DEFAULT_DIRECTORY,f'part_{ip_index:02d}'))
-save_files_dir=os.path.join(DEFAULT_DIRECTORY,"save_files")
-if os.path.exists(save_files_dir) is False:
-    os.makedirs(save_files_dir)
-
-OUTPUT_FILE = Path(cfg['data']['output_base'])
-TOKEN_INFO_FILE = Path(cfg['data']['output_token'])
-OUTPUT_FILE=os.path.join(save_files_dir,OUTPUT_FILE)
-TOKEN_INFO_FILE=os.path.join(save_files_dir,TOKEN_INFO_FILE)
-
+TOKEN_INFO_FILE,OUTPUT_FILE,MAX_TOKEN_LEN,save_files_dir,big_dir,DEFAULT_DIRECTORY=get_init_file()

 CKPT_DIR = cfg['model']['checkpoint']
 MIN_PIXELS = cfg['image']['min_pixels']

examples/llava_ov_1_5/sample_packing/2_do_hashbacket.py
Lines changed: 1 addition & 2 deletions

@@ -2,7 +2,6 @@
 from pprint import pprint
 import os
 import yaml
-from tool import get_ip_info
 import random
 from tool import get_init_file

@@ -26,7 +25,7 @@ def get_hs(hs):
     return mean,min_,max_,num

 def init():
-    input_file ,MAX_TOKEN_LEN,save_files_dir,_,_= get_init_file()
+    input_file ,_,MAX_TOKEN_LEN,save_files_dir,_,_= get_init_file()
     if not os.path.exists(input_file):
         print(f" file {input_file} does not exist!" )
         processor=None

examples/llava_ov_1_5/sample_packing/3_s2_prepare_rawsamples-vqa.py
Lines changed: 1 addition & 1 deletion

@@ -7,7 +7,7 @@
 from concurrent.futures import ThreadPoolExecutor, as_completed
 from tool import get_init_file

-input_token_file ,MAX_TOKEN_LEN,save_files_dir,big_dir,DEFAULT_DIRECTORY= get_init_file()
+input_token_file,_ ,MAX_TOKEN_LEN,save_files_dir,big_dir,DEFAULT_DIRECTORY= get_init_file()
 SRC_DIR_IMGS = DEFAULT_DIRECTORY # The storage location of image data
 SRC_DIR_JSONS = DEFAULT_DIRECTORY # The storage location of json data
 SRC_DST_EXTENSIONS = ("jpg", "json")

examples/llava_ov_1_5/sample_packing/4_convert_packedsample_to_wds.py
Lines changed: 1 addition & 1 deletion

@@ -157,7 +157,7 @@ def write_config(path: EPath, media=None, template_func=None, class_name=None):

 def _add_arguments(parser: argparse.ArgumentParser):

-    input_token_file ,MAX_TOKEN_LEN,save_files_dir,big_dir,DEFAULT_DIRECTORY= get_init_file()
+    input_token_file ,_,MAX_TOKEN_LEN,save_files_dir,big_dir,DEFAULT_DIRECTORY= get_init_file()
     output_dir=DEFAULT_DIRECTORY+'_wds'
     last_save_dir_json=os.path.join(save_files_dir,"row_packing_jsons")
     last_save_dir_image=os.path.join(save_files_dir,"row_packing_images")

examples/llava_ov_1_5/sample_packing/5_make_mix_wds_config.py

Lines changed: 0 additions & 166 deletions
This file was deleted.

examples/llava_ov_1_5/sample_packing/README.md
Lines changed: 0 additions & 3 deletions

@@ -3,7 +3,6 @@
 Download the required data from [LLaVA-One-Vision-1.5-Mid-Training-85M](https://huggingface.co/datasets/lmms-lab/LLaVA-One-Vision-1.5-Mid-Training-85M/tree/main)
 ## 2. Configure config.yaml
 The following are the key parameter configurations in the config.yaml file:
-- **hosts**: Specify the IP for offline packing. For a single machine, only one IP needs to be entered.
 - **hf_data**: The address of the downloaded data.
 - **directory**: The save address for processing results.
 - **checkpoint**: The address of the tokenizer used for VLM.
@@ -16,8 +15,6 @@ Run the offline_packing_pipeline.sh script. This script will sequentially execut
 3. Perform packing on the samples.
 4. Prepare for generating WebDataset.
 5. Package the packing results into WebDataset.
-6. Generate the configuration file for WebDataset.
-

examples/llava_ov_1_5/sample_packing/config.yaml
Lines changed: 0 additions & 1 deletion

@@ -1,4 +1,3 @@
-hosts: "host_1.txt"
 hf_data: "your_LLaVA-One-Vision-1.5-Mid-Training-85M_data_path"
 data:
   directory: "your_to_save_result_path"
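
For reference, a minimal sketch of a post-change config.yaml, assembled from the keys the pipeline scripts actually read (hf_data, data.directory, data.output_base, data.output_token, sample.max_len, sample.task_type, model.checkpoint, image.min_pixels); every value below is a placeholder assumption, not taken from the commit:

    # Sketch only: keys mirror what the scripts read; all values are assumed.
    import yaml

    cfg = yaml.safe_load("""
    hf_data: "your_LLaVA-One-Vision-1.5-Mid-Training-85M_data_path"
    data:
      directory: "your_to_save_result_path"
      output_base: "base_info.json"     # assumed filename
      output_token: "token_info.json"   # assumed filename
    sample:
      max_len: 8192                     # assumed value
      task_type: "sft"                  # assumed value
    model:
      checkpoint: "your_tokenizer_checkpoint_path"
    image:
      min_pixels: 3136                  # assumed value
    """)

    assert "hosts" not in cfg  # the multi-machine host list is gone after this PR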

examples/llava_ov_1_5/sample_packing/huggingface_data_parse.py
Lines changed: 6 additions & 9 deletions

@@ -1,6 +1,6 @@
 from datasets import load_dataset
 from multiprocessing import Pool
-from tool import cfg,get_ip_info,get_init_file
+from tool import cfg,get_init_file
 import os
 from functools import partial
 from tqdm import tqdm
@@ -36,11 +36,9 @@ def check_image(image_path) -> bool:
     except Exception as e:
         return False

-def parese_dataset(data_item,ip_indx,ip_num,dst_dir):
+def parse_dataset(data_item,dst_dir):
     try:
         index, item = data_item
-        if index%ip_num!=ip_indx:
-            return
         name=item['id'].replace('/','_')
         name=os.path.splitext(name)[0]

@@ -76,12 +74,11 @@ def parese_dataset(data_item,ip_indx,ip_num,dst_dir):

 def main(workers):
     data_path=cfg['hf_data']
-    TOKEN_INFO_FILE,MAX_TOKEN_LEN,save_files_dir,big_dir,DEFAULT_DIRECTORY=get_init_file()
-    dataset = load_dataset(data_path,data_files='*/*.parquet', split="train", streaming=True)
+    DEFAULT_DIRECTORY=get_init_file()[-1]
+    dataset = load_dataset(data_path,data_files='*/*/*.parquet', split="train", streaming=True)
     data_iter = enumerate(dataset)
-    ip_indx,ip_num,_=get_ip_info()
-    with Pool(processes=workers) as pool, tqdm(total=8.5e8, desc="copy") as bar:
-        for _ in pool.imap_unordered(partial(parese_dataset,ip_indx=ip_indx,ip_num=ip_num,dst_dir=DEFAULT_DIRECTORY), data_iter):
+    with Pool(processes=workers) as pool, tqdm(total=8.5e8, desc="parsing data") as bar:
+        for _ in pool.imap_unordered(partial(parse_dataset,dst_dir=DEFAULT_DIRECTORY), data_iter):
             bar.update()

 if __name__=="__main__":
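
With the IP-shard filter removed, main() now fans every streamed item out to the worker pool unchanged. Below is a minimal runnable sketch of that dispatch pattern; the toy list and dst_dir value stand in for the streamed HF dataset and the real output directory, and the stub body replaces the actual image/json extraction:

    from functools import partial
    from multiprocessing import Pool

    def parse_dataset(data_item, dst_dir):
        # Every (index, item) pair is processed; the old
        # `if index % ip_num != ip_indx: return` shard filter is gone.
        index, item = data_item
        return f"{dst_dir}/{index}_{item}"

    if __name__ == "__main__":
        data_iter = enumerate(["a", "b", "c"])  # stands in for enumerate(dataset)
        with Pool(processes=2) as pool:
            for _ in pool.imap_unordered(partial(parse_dataset, dst_dir="/tmp"), data_iter):
                pass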

examples/llava_ov_1_5/sample_packing/offline_packing_pipeline.sh
Lines changed: 0 additions & 1 deletion

@@ -17,7 +17,6 @@ docker exec -it "$CONTAINER_NAME" bash -c '
 run_python_script "2_do_hashbacket.py"
 run_python_script "3_s2_prepare_rawsamples-vqa.py"
 run_python_script "4_convert_packedsample_to_wds.py"
-run_python_script "5_make_mix_wds_config.py"

 echo "─────────────────All processing workflows have been successfully completed.───────────────────"
 '

examples/llava_ov_1_5/sample_packing/tool.py
Lines changed: 3 additions & 38 deletions

@@ -5,52 +5,23 @@
 import pickle
 from tqdm import tqdm

-
 config='config.yaml'
 with open(config, 'r', encoding='utf-8') as f:
     cfg = yaml.safe_load(f)

-def get_ip_info(ip_file=cfg['hosts']):
-    hostname = socket.gethostname()
-    local_ip = socket.gethostbyname(hostname)
-    ips=get_ips(ip_file)
-    ip_indx=ips[local_ip]
-    ip_num=len(ips)
-    return ip_indx,ip_num,local_ip
-
-def get_ips(ip_file):
-    with open(ip_file, "r") as f:
-        ip_list = [line.strip() for line in f if line.strip()]
-    ip_list=sorted(ip_list)
-    res={ip:i for i,ip in enumerate(ip_list)}
-    return res
-
-def get_split_datas_by_ips(ip_file,datas):
-    ip_index,ip_num,_=get_ip_info(ip_file)
-    num_datas=len(datas)
-    step=num_datas//ip_num
-    return datas[ip_index*step:(ip_index+1)*step] if ip_index<ip_num-1 else datas[ip_index*step:num_datas]
-
 def get_init_file():
     MAX_TOKEN_LEN = cfg['sample']['max_len']
     big_dir = Path(cfg['data']['directory'])
-    try:
-        ip_index,_,_=get_ip_info(cfg['hosts'])
-        print('SUNCCESS ->>>>>>>>>>>>')
-    except:
-        print(f"getting ip_index error, default to 0")
-        ip_index=0
+    ip_index=0
     DEFAULT_DIRECTORY=os.path.join(big_dir,f'part_{ip_index:02d}')
-    print(DEFAULT_DIRECTORY)
     save_files_dir=os.path.join(DEFAULT_DIRECTORY,"save_files")
     os.makedirs(save_files_dir,exist_ok=True)
     OUTPUT_FILE = cfg['data']['output_base']
     TOKEN_INFO_FILE = cfg['data']['output_token']
-
     OUTPUT_FILE=os.path.join(save_files_dir,OUTPUT_FILE)
     TOKEN_INFO_FILE=os.path.join(save_files_dir,TOKEN_INFO_FILE)

-    return TOKEN_INFO_FILE,MAX_TOKEN_LEN,save_files_dir,big_dir,DEFAULT_DIRECTORY
+    return TOKEN_INFO_FILE,OUTPUT_FILE,MAX_TOKEN_LEN,save_files_dir,big_dir,DEFAULT_DIRECTORY

 def get_num_boxs():
     pairs_dir=cfg['data']['directory']
@@ -63,10 +34,4 @@ def get_num_boxs():
         bin_boxes = pickle.load(f)
         box_num+=len(bin_boxes)
         sample_num+=sum([len(box) for box in bin_boxes])
-    return box_num,sample_num
-
-if __name__ == "__main__":
-    file_path=os.path.join('/vlm/chengzheng/datasets/pdf_datas/part_00/save_files','bins_boxs.pkl')
-    with open(file_path, 'rb') as f:
-        bin_boxes = pickle.load(f)
-    print(len(bin_boxes))
+    return box_num,sample_num
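
After this change get_init_file() returns six values instead of five (OUTPUT_FILE joins the tuple), which is why every caller in this commit gains an extra slot. A sketch of the three unpacking patterns used across the touched scripts, assuming tool.py is importable with a valid config.yaml beside it:

    from tool import get_init_file  # assumes config.yaml sits next to tool.py

    # Full unpack, as in 1_s1_get_tokenlens_v3-sft.py:
    TOKEN_INFO_FILE, OUTPUT_FILE, MAX_TOKEN_LEN, save_files_dir, big_dir, DEFAULT_DIRECTORY = get_init_file()

    # Skip unneeded slots with `_`, as in 2_do_hashbacket.py:
    input_file, _, MAX_TOKEN_LEN, save_files_dir, _, _ = get_init_file()

    # Index the tuple when only one value matters, as in huggingface_data_parse.py:
    DEFAULT_DIRECTORY = get_init_file()[-1]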
