import csv
import os

import pandas as pd
import torch
from torch.utils.data import DataLoader, Dataset
# Dataset and DataLoader without word embeddings

def prepare_sequence(seq, to_ix):
    """Map each item of ``seq`` to its index in ``to_ix`` and return a tensor.

    Args:
        seq: Iterable of items; every item is looked up with ``to_ix[item]``.
        to_ix: Mapping from item to integer index.

    Returns:
        A 1-D ``torch.long`` tensor holding the looked-up indices in order.

    Raises:
        KeyError: If ``to_ix`` is a dict and an item of ``seq`` is missing.
    """
    indices = []
    for token in seq:
        indices.append(to_ix[token])
    return torch.tensor(indices, dtype=torch.long)

# Three parts: 1. model  2. dataset  3. train
# word_to_ix = {}
# for sentence, tags in training_data:
#     for word in sentence:
#         if word not in word_to_ix:
#             word_to_ix[word] = len(word_to_ix)
class MyNerDataset(Dataset):
    """Sentence-level NER dataset read from ``data/<root>.txt``.

    The file is parsed as space-delimited rows with no header: column 0 is
    the token and column 1 is its label.  Rows are accumulated into one
    sentence until a token equal to "。" is seen; that row is included in
    the sentence and closes it.  Rows after the last "。" never reach
    ``training_data``.

    Attributes:
        root: The ``root`` argument as given.
        path: ``os.path.join("data", root + ".txt")``.
        data: The parsed file as a DataFrame of strings.
        training_data: List of ``(tokens, labels)`` tuples, one per sentence.
        word_to_idx: Token -> integer id, numbered in order of first
            appearance across ``training_data``.
        ls_text, ls_label: Tokens / labels that were read after the last
            completed sentence (empty when parsing stopped on a "。").
    """

    def __init__(self, root, size):
        """Load at most ``size`` sentences from ``data/<root>.txt``.

        Args:
            root: File stem; the file read is ``data/<root>.txt``.
            size: Number of complete sentences after which reading stops.
                ``0`` yields an empty dataset.  A value that is never equal
                to the sentence count (e.g. ``None``) reads the whole file.
        """
        super().__init__()
        self.root = root
        self.path = os.path.join("data", self.root + ".txt")
        # Tokens must be kept verbatim:
        #   * QUOTE_NONE      -> a literal '"' token is data, not a quote.
        #   * keep_default_na -> tokens such as "NA" / "null" stay strings.
        #   * dtype=str       -> digit tokens are not converted to numbers.
        self.data = pd.read_csv(
            self.path,
            header=None,
            delimiter=" ",
            dtype=str,
            keep_default_na=False,
            quoting=csv.QUOTE_NONE,
        )
        self.word_to_idx = {}
        self.training_data = []

        tokens, labels = [], []
        for token, label in zip(self.data[0], self.data[1]):
            # Checked before consuming a row so that size == 0 reads nothing.
            if len(self.training_data) == size:
                break
            tokens.append(token)
            labels.append(label)
            if token == "。":
                # Sentence terminator: store the sentence, start a new one.
                self.training_data.append((tokens, labels))
                tokens, labels = [], []

        # Kept as attributes for backward compatibility; they hold whatever
        # was read after the last completed sentence.
        self.ls_text = tokens
        self.ls_label = labels

        # Assign ids in order of first appearance.  ``len`` is evaluated
        # before ``setdefault`` inserts, so a new word gets the next free id.
        for sentence, _ in self.training_data:
            for word in sentence:
                self.word_to_idx.setdefault(word, len(self.word_to_idx))

    def __getitem__(self, index):
        """Return the ``(tokens, labels)`` tuple stored at ``index``."""
        return self.training_data[index]

    def __len__(self):
        """Return the number of sentences loaded."""
        return len(self.training_data)


def load_data(root, batch_size, size=None):
    """Build the NER dataset for ``root`` and a DataLoader over it.

    Args:
        root: File stem forwarded to ``MyNerDataset``.
        batch_size: Batch size given to the ``DataLoader``.
        size: Maximum number of sentences the dataset loads.  When omitted
            it defaults to ``batch_size``, which is the original behaviour
            (the dataset then holds at most one batch of sentences).

    Returns:
        A ``(dataloader, dataset)`` tuple.

    No ``collate_fn`` is passed, so the DataLoader uses its default collation.
    NOTE(review): default collation presumably needs equal-length sentences
    within a batch -- confirm before using ``batch_size > 1``.
    """
    if size is None:
        # Preserve the historical coupling between dataset size and batch size.
        size = batch_size
    ds = MyNerDataset(root, size)
    dl = DataLoader(dataset=ds, batch_size=batch_size, num_workers=0)
    return dl, ds
