boq-classification/GPT.py at master · PuddingPotato/boq-classification · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
import requests
import json
import pandas as pd
import os
import re
from sklearn.metrics import accuracy_score
import time

# Base directory under which every data folder is looked up.
# NOTE(review): "__file__" is a quoted string literal here, not the module's
# __file__ variable, so abspath() resolves it against the current working
# directory and dirname() yields the working directory itself. Confirm that
# running from the data directory is the intended usage.
current_directory = os.path.dirname(os.path.abspath("__file__"))

# Endpoint that receives the classification requests. The value looks like a
# placeholder and presumably has to be replaced before running.
url = "POST_URL"

# HTTP headers sent with every request; the key also looks like a placeholder.
headers = {
    'Api-Key': 'API_KEY',
}

# Names of the entries in the input folder, listed once at import time.
# os.path.join is used so the path resolves on every platform (on Windows it
# produces the same backslash-separated path as before).
file_list = os.listdir(os.path.join(current_directory, 'Extracted data_gz'))

def extract_number(filename):
    """Return the first run of digits in *filename* as an int, or 0 if none.

    Used as a sort key so that names are ordered by their embedded number
    rather than lexicographically.
    """
    first_digits = re.search(r'(\d+)', filename)
    if first_digits is None:
        return 0
    return int(first_digits.group(0))

# Instruction prompt sent alongside every question. It asks the service to
# answer as 'Answer: <category>, Reason: <reason>', which _parse_reply relies on.
_PROMPT_TEXT = "You are an expert in categorizing items. Based on the descriptions provided, please classify each item into one of the predefined categories listed below. Respond with the name of the category and a very brief explanation of your reasoning. The response should be in Thai. You must only classify an item into a category if the information exactly matches the conditions described. You must not respond with any category name other than those listed below. Response Format: 'Answer: {กลุ่มรายการ}, Reason: {เหตุผลสั้นที่สุด}'Predefined Categories: 1.กำไร 2.ภาษี 3.งานเตรียมการ 4.ระบบป้องกันดินพัง 5.คอนกรีต 6.ไม้แบบ 7.เหล็ก 8.ข้อต่อเกลียว 9.งานปูกระเบื้องพื้น 10.งานปูกระเบื้องยาง 11.เตรียมผิวกระเบื้องยาง/SPC 12.ผนังก่ออิฐมวลเบา 13.ผนัง Precast 14.ผนังฉาบปูนเรียบภายใน 15.ผนังฉาบปูนเรียบภายนอก 16.งานปูกระเบื้องผนัง 17.เตรียมผิวผนัง 18.ฝ้าค.ส.ล.ผิวฉาบเรียบ 19.ฝ้ายิบซั่มฉาบเรียบ 20.ฝ้ายิบซั่มฉาบเรียบกันชื้น 21.ฝ้ากล่อง 22.ประตูเหล็ก 23.ประตูโดยศุภาลัย 24.งานกระจก 25.งานทาสีภายใน 26.งานทาสีรองพื้น 27.งานทาสี TEXTURE 28.งานทาสีภายนอก 29.งานทาสีท่องานระบบ 30.งานราวระเบียง 31.งานแผงบังแอร์ 32.ผนังก่ออิฐมอญ 33.ประตูอื่นๆ โดยผรม. (ครบชุด) 34.เตรียมผิวปาร์เก้ 35.อื่นๆ"


def _build_payload(question):
    """Return the request body that asks the service to classify *question*."""
    return {
        "events": [
            {
                "type": 1,
                "timestamp": 1701821530,
                "source": {
                    "userId": "xxxxx10"
                },
                "message": {
                    "text": f"{question}",
                    "prompttext": _PROMPT_TEXT,
                },
            }
        ]
    }


def _parse_reply(reply):
    """Split an 'Answer: X, Reason: Y' reply into ``(X, Y)``.

    When the reply has no ', Reason: ' separator the reason is returned as an
    empty string instead of raising.
    """
    answer_part, _, reason = reply.partition(', Reason: ')
    return answer_part.replace('Answer: ', ''), reason


def GPT(folder_path):
    """Classify every row of each input file through the remote service.

    For each file name in *folder_path* (processed in the numeric order given
    by ``extract_number``) the gzip-compressed CSV is read from the
    'Extracted data_gz' folder, one question per row is posted to ``url``,
    and the predicted category, reason and running accuracy are stored in the
    new columns 'BOQClassifier', 'BOQClassifier_Reason' and
    'BOQClassifier_Accuracy'. The annotated frame is written to
    'GPT Outputs' and the wrongly classified rows to 'GPTIncorrect_ans'.

    Args:
        folder_path: iterable of input file names (despite the name, not a
            path) located in the 'Extracted data_gz' folder.

    Returns:
        None. Results are written to disk and progress is printed.

    Relies on the module-level ``current_directory``, ``url``, ``headers``
    and ``extract_number``.
    """
    input_dir = os.path.join(current_directory, 'Extracted data_gz')
    output_dir = os.path.join(current_directory, 'GPT Outputs')
    incorrect_dir = os.path.join(current_directory, 'GPTIncorrect_ans')

    for i in sorted(folder_path, key=extract_number):
        # Per-row results; each list gets exactly one entry per data row so
        # they can be assigned as columns afterwards.
        pred_cat = []
        reasons = []
        accuracy = []
        # Labels/predictions of the rows that were actually answered; the
        # running accuracy is computed over these only.
        cat_true = []
        answered_pred = []
        incorrect_answer_index = []

        df = pd.read_csv(os.path.join(input_dir, i), compression='gzip')
        df = df.fillna('อื่นๆ')
        print(f'Reading {i}')
        print(f'Total record : {len(df)}\n')

        xlsx_count = len(os.listdir(output_dir))

        # Built with iterrows so an empty file yields an empty list. The outer
        # quotes differ from the inner ones so this parses before Python 3.12.
        questions = [
            f"หมวด BOQ : {row['หมวด BOQ']}, ชื่องาน : {row['ชื่องาน']}, ชื่องานย่อย : {row['ชื่องานย่อย']}, รายการ : {row['รายการ']}, หน่วย : {row['หน่วย']} อยู่ในกลุ่มรายการอะไร"
            for _, row in df.iterrows()
        ]
        true_labels = df['กลุ่มรายการ'].tolist()

        for row_pos, (question, true_label) in enumerate(zip(questions, true_labels)):
            json_data = json.dumps(_build_payload(question))
            response = requests.post(url, headers=headers, data=json_data)

            if response.status_code == 200:
                # Parse the body once. Assumes the endpoint returns a bare
                # JSON string, as the reply format in the prompt requests.
                reply = response.json()
                predicted, reason = _parse_reply(reply)

                cat_true.append(true_label)
                answered_pred.append(predicted)
                pred_cat.append(predicted)
                reasons.append(reason)
                acc = round(accuracy_score(cat_true, answered_pred)*100, 3)
                accuracy.append(acc)

                # Record the row position of every wrong prediction directly
                # instead of inferring it from a drop in the running accuracy.
                if predicted != true_label:
                    incorrect_answer_index.append(row_pos)

                print(f'{i} col: {row_pos + 1}/{len(df)} Accuracy: {acc} [{len(pred_cat)}, {len(reasons)}, {len(accuracy)}]')
                print(f'Question : {question}')
                print(f'{str(reply)} \n')

            else:
                print('Request failed.')
                print('Status code:', response.status_code)
                print('Response:', response.text)
                # Keep the result lists aligned with the rows: a failed row
                # gets an empty prediction/reason and no accuracy value.
                pred_cat.append("")
                reasons.append("")
                accuracy.append(None)

            # Pause between requests (kept from the original pacing).
            time.sleep(1)

        df['BOQClassifier'] = pred_cat
        df['BOQClassifier_Reason'] = reasons
        df['BOQClassifier_Accuracy'] = accuracy
        incorrect_answer = df.iloc[incorrect_answer_index]
        xlsx_count += 1

        output_path = os.path.join(output_dir, f'gptoutput_{xlsx_count}.gz')
        incorrect_path = os.path.join(incorrect_dir, f'gptoutput_{xlsx_count}.gz')
        incorrect_answer.to_csv(incorrect_path, index = False, compression = 'gzip')
        df.to_csv(output_path, index = False, compression = 'gzip')
        print('\n' + f'Saved {output_path}')

        # NOTE(review): the file is read from 'Extracted data_gz' but removed
        # from 'Extracted data'; the folder name is kept as it was because
        # redirecting a delete is destructive. Confirm which one is intended.
        processed_path = os.path.join(current_directory, 'Extracted data', i)
        os.remove(processed_path)
        print(f'Removed {processed_path}' + '\n')