import sys
import re


# file id -> parent id, for every record accepted by read_parents().
parents_map = {}

# file id -> name, split by record type: 'REG' records go to files_names_map,
# every other type goes to dirs_names_map (see read_parents()).
files_names_map = {}
dirs_names_map = {}

# Ids loaded from the file given as the second command-line argument.
# NOTE(review): currently only referenced by a commented-out filter in
# map_fileids_to_paths().
storage_file_ids = set()

# Path of the index file to load, taken from the first command-line argument.
# It is read at import time, so importing this module without arguments
# raises IndexError. The file is parsed as tab-separated by read_parents().
#input_file = 'recovered_proper_full/recovered_full5_sorted_uniq_merged_local_media.csv'
input_file = sys.argv[1] #'recovered_proper_full/recovered_full7_sorted_uniq_local_media.csv'


def read_parents():
    """Load the tab-separated index in ``input_file`` into the module maps.

    Each accepted line must have exactly 11 tab-separated fields. The fields
    used here are: 0 = space id, 1 = file id, 2 = type, 3 = parent id,
    4 = name, 8 = deleted flag. Example record (shown comma-separated):

        edcd12c92d6ed0658a731016c259e703ch6c87,000001b6b707c1dd08bfd85ca07738bfch56d7,REG,938e2480995ff3d9549e7193f3e18f1fch342,20888116-main-0000-20888652-45965623.pdf,0,436,1652542036,False,12,9029729

    Records are skipped when the field count is wrong, when any of the used
    fields is empty, when the file id starts with ``trash_``, or when the
    deleted flag is the string ``'True'``.

    Side effects:
        * ``parents_map[file id] = parent id`` for every accepted record.
        * ``files_names_map[file id] = name`` for type ``'REG'``, otherwise
          ``dirs_names_map[file id] = name``.
        * Records with an empty parent id whose file id does not start with
          ``space_`` are printed to stdout with tabs replaced by commas.
    """
    with open(input_file, 'r') as f:
        for line in f:
            stripped = line.rstrip()
            record = stripped.split("\t")
            if len(record) != 11:
                continue

            space_id = record[0]
            if not space_id:
                continue

            fileid = record[1]
            if not fileid or fileid.startswith('trash_'):
                continue

            typ = record[2]
            if not typ:
                continue

            parent_id = record[3]
            if not parent_id:
                # Only ids starting with 'space_' are allowed to have no
                # parent; report every other parentless record. The line is
                # rendered here because the previous code printed a variable
                # (comma_line) whose assignment was commented out, which
                # raised NameError on the first such record.
                if not fileid.startswith('space_'):
                    print(stripped.replace("\t", ","))
                continue

            name = record[4]
            if not name:
                continue

            deleted = record[8]
            if not deleted or deleted == 'True':
                continue

            if typ == 'REG':
                files_names_map[fileid] = name
            else:
                dirs_names_map[fileid] = name

            parents_map[fileid] = parent_id

# Patterns used by generate_path_from_name() to pull the numeric id out of a
# bare file name. Each pattern has a single capture group (the id), so
# findall() returns a list of id strings. The sample paths listed above each
# pattern show the storage layout: <prefix>/<id>/<file name>.

# OCR text files: "<id>-text-...".
# ocr/bn-009c2/18981716/18981716-text-0008-25938440-60969000.txt
# ocr/bn-009c2/18983302/18983302-text-0003-25292594-59280865_ZcRBeua.txt
# ocr/bn-009c2/19170589/19170589-text-0002-31538997-78160725.txt
# ocr/bn-009c2/20130481/20130481-text-0000-22343353-50244058.txt
# ocr/bn-009c2/20130605/20130605-text-0001-22436011-50706307.txt
# ocr/bn-009c2/26079278/26079278-text-0004-27836764-66970069.txt
# ocr/bn-009c2/29664584/29664584-text-0000-37671328-96551802.txt
# ocr/bn-009c2/49261946/49261946-text-0002-52012032-143685086.txt
# ocr/bn-009c2/55598870/55598870-text-0005-62754170-174959911.txt
# ocr/bn-009c2/83267753/83267753-text-0001-84659453-240109920.txt
# ocr/bn-009c2/85672147/85672147-text-0001-100379216-296411997.txt
ocr_regex = re.compile(r'(\d+)-text')

# "bn" main files and their thumbnails: "<id>-main-..." and
# "thumb_<id>-main-...".
# files/bn-009c2/52977775/thumb_52977775-main-0002-53065912-146066033.jpeg
# files/bn-009c2/55601830/thumb_55601830-main-0004-63112200-175242733.jpeg
# files/bn-009c2/5783711/thumb_5783711-main-0002-5783700-10605466.jpeg
# files/bn-009c2/60704447/thumb_60704447-main-0002-60835553-169132782.jpeg
# files/bn-009c2/6577596/thumb_6577596-main-0001-6577504-11919258.jpeg
# files/bn-009c2/6587578/thumb_6587578-main-0010-6587548-11929003.jpeg
# files/bn-009c2/6595126/6595126-main-0000-6595116-11936241.pdf
# files/bn-009c2/66471463/thumb_66471463-main-0003-66617858-185630609.jpeg
# files/bn-009c2/6650755/thumb_6650755-main-0003-6650729-11990373.jpeg
# files/bn-009c2/6751727/6751727-main-0000-6751722-12086505.pdf
# files/bn-009c2/6757066/thumb_6757066-main-0004-6757053-12091428.jpeg
# files/bn-009c2/7138338/thumb_7138338-main-0002-7138318-12457848.jpeg
# files/bn-009c2/7139039/thumb_7139039-main-0005-7139022-12458493.jpeg
# files/bn-009c2/9108516/thumb_9108516-main-0001-12837063-26262180.jpeg
# NOTE: filesbn_regex is unanchored, so it also matches the "thumb_" names;
# generate_path_from_name() only consults filesbn_thumb_regex when
# filesbn_regex did not produce exactly one match.
filesbn_regex = re.compile(r'(\d+)-main')
filesbn_thumb_regex = re.compile(r'thumb_(\d+)-main')

# "mnwk" files: "<id>_<number>".
# files/mnwk-ca6/193200/193200_1.jpg
# files/mnwk-ca6/279229/279229_0.jpg
# files/mnwk-ca6/320930/320930_1.jpg
# files/mnwk-ca6/406675/406675_0.jpg
# files/mnwk-ca6/434204/434204_1.jpg
# files/mnwk-ca6/456187/456187_1.jpg
# files/mnwk-ca6/469910/469910_3.jpg
# files/mnwk-ca6/497637/497637_1.jpg
# files/mnwk-ca6/66181/66181_0.jpg
filesmnwk_regex = re.compile(r'(\d+)_\d+')

# "nac" files: "<id>_<alphanumeric run>_...".
# files/nac-75d4/617588/617588_a5e3e5fa45dbcbb4f3e3d195c5cfbad3c9cc9022b6a03a09f3c78d27063_g2RWuUK.jpg
# files/nac-75d4/681823/681823_eb490f96f752fedfae35006cd2c07a6bb37d1f30983cb0b174f7de16f10_YqJT2uJ.jpg
# files/nac-75d4/697283/697283_eed71b9ac9f71d2b6d36b2f8b451ed0d2955ddb232f7ed19e8d1e4000d6_yYvkz8T.jpg
# files/nac-75d4/711129/711129_eed71b9ac9f71d2b6d36b2f8b451ed0d2955ddb232f7ed19e8d1e4000d6_N3j3RPe.jpg
# files/nac-75d4/712116/712116_eed71b9ac9f71d2b6d36b2f8b451ed0d2955ddb232f7ed19e8d1e4000d6_QbbacKw.jpg
# files/nac-75d4/735579/735579_eed71b9ac9f71d2b6d36b2f8b451ed0d2955ddb232f7ed19e8d1e4000d6_XmBo2Kb.jpg
# files/nac-75d4/87162/87162_eed71b9ac9f71d2b6d36b2f8b451ed0d2955ddb232f7ed19e8d1e4000d6f9_7jKAH5K.jpg
# files/nac-75d4/893167/893167_eed71b9ac9f71d2b6d36b2f8b451ed0d2955ddb232f7ed19e8d1e4000d6_IGhB1yx.jpg
# files/nac-75d4/92558/92558_eed71b9ac9f71d2b6d36b2f8b451ed0d2955ddb232f7ed19e8d1e4000d6f9_Yr5Um4m.jpg
filesnac_regex = re.compile(r'(\d+)_[0-9a-zA-Z]+_')



def generate_path_from_name(name):
    """Build a storage path for *name* from the numeric id embedded in it.

    The name is tested against the module-level patterns in a fixed order.
    The first pattern that finds exactly one id in the name decides the
    result, which has the form ``<prefix>/<id>/<name>``, e.g.

        ocr/bn-009c2/83267753/83267753-text-0001-84659453-240109920.txt
        files/bn-009c2/6751727/6751727-main-0000-6751722-12086505.pdf
        files/bn-009c2/18978882/thumb_18978882-main-0002-19999614-43339284.jpeg
        files/nac-75d4/92558/92558_eed71b9ac9f71d2b6d36b2f8b451ed0d2955ddb232f7ed19e8d1e4000d6f9_Yr5Um4m.jpg
        files/mnwk-ca6/469910/469910_3.jpg

    Returns ``None`` when no pattern yields exactly one match.
    """
    # (pattern, directory prefix), listed in the order they are tried.
    lookup_order = (
        (ocr_regex, 'ocr/bn-009c2'),
        (filesbn_regex, 'files/bn-009c2'),
        (filesbn_thumb_regex, 'files/bn-009c2'),
        (filesnac_regex, 'files/nac-75d4'),
        (filesmnwk_regex, 'files/mnwk-ca6'),
    )

    for pattern, prefix in lookup_order:
        found_ids = pattern.findall(name)
        # Names with zero or several candidate ids are left to the next
        # pattern rather than guessed at.
        if len(found_ids) == 1:
            return f'{prefix}/{found_ids[0]}/{name}'

    return None


def _print_generated_path(fileid, name):
    """Print ``<fileid>\\t<path>`` using the path derived from *name*, if any."""
    generated_path = generate_path_from_name(name)
    if generated_path:
        print(f"{fileid}\t{generated_path}")


def map_fileids_to_paths():
    """Print a ``<fileid>\\t<path>`` line for every entry of files_names_map.

    For each file the chain of parent ids in ``parents_map`` is followed
    upwards, collecting directory names from ``dirs_names_map``. When an id
    starting with ``space_`` is reached, the collected names (outermost
    first) plus the file name are joined with ``/`` and printed.

    When an ancestor is not a known directory, the path is instead derived
    from the file name via generate_path_from_name(). That fallback line is
    printed at most once per file; earlier code repeated the direct-parent
    check and so printed it twice (and possibly again during the walk).

    If the parent chain loops back on itself, the walk stops and the
    fallback is used, instead of looping forever.

    NOTE(review): a file whose direct parent is a ``space_`` id that is not
    in dirs_names_map still gets both the fallback line and the plain
    ``<fileid>\\t<name>`` line, as before -- confirm whether that is wanted.
    """
    for fileid, name in files_names_map.items():
        parents = []
        fallback_printed = False

        last_parent_id = parents_map[fileid]
        if last_parent_id in dirs_names_map:
            parents.append(dirs_names_map[last_parent_id])
        else:
            # Direct parent is not a known directory: derive the path from
            # the file name.
            _print_generated_path(fileid, name)
            fallback_printed = True

        visited = set()  # ancestor ids already stepped through (cycle guard)
        while last_parent_id:
            if last_parent_id.startswith("space_"):
                # Reached the top: names were collected innermost-first.
                parents.reverse()
                parents.append(name)
                print(f"{fileid}\t{'/'.join(parents)}")
                break

            visited.add(last_parent_id)
            last_parent_id = parents_map.get(last_parent_id)

            if not last_parent_id or last_parent_id.startswith("space_"):
                # Either the chain ended (loop condition stops the walk) or
                # the top was reached (handled at the start of the next pass).
                continue

            if last_parent_id in visited or last_parent_id not in dirs_names_map:
                # Broken or cyclic chain: fall back to the name-derived path.
                if not fallback_printed:
                    _print_generated_path(fileid, name)
                break

            parents.append(dirs_names_map[last_parent_id])

if __name__ == '__main__':
    # Second command-line argument: a text file with one id per line.
    missing_ids_file = sys.argv[2]
    with open(missing_ids_file, 'r') as ids_handle:
        storage_file_ids.update(entry.rstrip() for entry in ids_handle)

    # Load the index named by the first command-line argument into the
    # module-level maps; parentless records are reported on stdout.
    read_parents()

    # Path resolution is currently switched off; uncomment to enable it.
    #map_fileids_to_paths()



