traveler514 committed
Commit 81a794d · 0 Parent(s)

first commit
.gitattributes ADDED
@@ -0,0 +1,2 @@
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
+ **/tokenizer.json filter=lfs diff=lfs merge=lfs -text
.gitignore ADDED
@@ -0,0 +1,174 @@
+ # Byte-compiled / optimized / DLL files
+ __pycache__/
+ *.py[cod]
+ *$py.class
+
+ # C extensions
+ *.so
+
+ # Distribution / packaging
+ .Python
+ build/
+ develop-eggs/
+ dist/
+ downloads/
+ eggs/
+ .eggs/
+ lib/
+ lib64/
+ parts/
+ sdist/
+ var/
+ wheels/
+ share/python-wheels/
+ *.egg-info/
+ .installed.cfg
+ *.egg
+ MANIFEST
+
+ # PyInstaller
+ # Usually these files are written by a python script from a template
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
+ *.manifest
+ *.spec
+
+ # Installer logs
+ pip-log.txt
+ pip-delete-this-directory.txt
+
+ # Unit test / coverage reports
+ htmlcov/
+ .tox/
+ .nox/
+ .coverage
+ .coverage.*
+ .cache
+ nosetests.xml
+ coverage.xml
+ *.cover
+ *.py,cover
+ .hypothesis/
+ .pytest_cache/
+ cover/
+
+ # Translations
+ *.mo
+ *.pot
+
+ # Django stuff:
+ *.log
+ local_settings.py
+ db.sqlite3
+ db.sqlite3-journal
+
+ # Flask stuff:
+ instance/
+ .webassets-cache
+
+ # Scrapy stuff:
+ .scrapy
+
+ # Sphinx documentation
+ docs/_build/
+
+ # PyBuilder
+ .pybuilder/
+ target/
+
+ # Jupyter Notebook
+ .ipynb_checkpoints
+
+ # IPython
+ profile_default/
+ ipython_config.py
+
+ # pyenv
+ # For a library or package, you might want to ignore these files since the code is
+ # intended to run in multiple environments; otherwise, check them in:
+ # .python-version
+
+ # pipenv
+ # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+ # However, in case of collaboration, if having platform-specific dependencies or dependencies
+ # having no cross-platform support, pipenv may install dependencies that don't work, or not
+ # install all needed dependencies.
+ #Pipfile.lock
+
+ # UV
+ # Similar to Pipfile.lock, it is generally recommended to include uv.lock in version control.
+ # This is especially recommended for binary packages to ensure reproducibility, and is more
+ # commonly ignored for libraries.
+ #uv.lock
+
+ # poetry
+ # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
+ # This is especially recommended for binary packages to ensure reproducibility, and is more
+ # commonly ignored for libraries.
+ # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
+ #poetry.lock
+
+ # pdm
+ # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
+ #pdm.lock
+ # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
+ # in version control.
+ # https://pdm.fming.dev/latest/usage/project/#working-with-version-control
+ .pdm.toml
+ .pdm-python
+ .pdm-build/
+
+ # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
+ __pypackages__/
+
+ # Celery stuff
+ celerybeat-schedule
+ celerybeat.pid
+
+ # SageMath parsed files
+ *.sage.py
+
+ # Environments
+ .env
+ .venv
+ env/
+ venv/
+ ENV/
+ env.bak/
+ venv.bak/
+
+ # Spyder project settings
+ .spyderproject
+ .spyproject
+
+ # Rope project settings
+ .ropeproject
+
+ # mkdocs documentation
+ /site
+
+ # mypy
+ .mypy_cache/
+ .dmypy.json
+ dmypy.json
+
+ # Pyre type checker
+ .pyre/
+
+ # pytype static type analyzer
+ .pytype/
+
+ # Cython debug symbols
+ cython_debug/
+
+ # PyCharm
+ # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
+ # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
+ # and can be added to the global gitignore or merged into this file. For a more nuclear
+ # option (not recommended) you can uncomment the following to ignore the entire idea folder.
+ #.idea/
+
+ # PyPI configuration file
+ .pypirc
+
+ .vscode/
+ evaluate/result/
README.md ADDED
@@ -0,0 +1,9 @@
+ ---
+ title: PATCHOULI
+ pinned: true
+ sdk: gradio
+ ---
+
+ # PATCHOULI
+
+ PATCHOULI (Patch Observing and Untangling Engine) is an easy-to-use tool for analyzing software security patches.
app.py ADDED
@@ -0,0 +1,164 @@
+ import os
+ import json
+ import logging
+ import gradio as gr
+
+ from backend.inference import section_infer, cwe_infer, PREDEF_MODEL_MAP, LOCAL_MODEL_PEFT_MAP, PREDEF_CWE_MODEL
+
+ APP_TITLE = "PATCHOULI"
+
+ STYLE_APP_TITLE = '<div style="text-align: center; font-weight: bold; font-family: Arial, sans-serif; font-size: 44px;">' + \
+     '<span style="color: #14e166">PATCH</span> ' + \
+     '<span style="color: #14e166">O</span>bserving ' + \
+     'and ' + \
+     '<span style="color: #14e166">U</span>ntang<span style="color: #14e166">l</span>ing ' + \
+     'Eng<span style="color: #14e166">i</span>ne' + \
+     '</div>'
+
+ # from 0.00 to 1.00, 41 colors
+ NONVUL_GRADIENT_COLORS = ["#d3f8d6",
+     "#d3f8d6", "#d0f8d3", "#ccf7d0", "#c9f7cd", "#c6f6cb", "#c2f6c8", "#bff5c5", "#bcf5c2", "#b8f4bf", "#b5f4bc",
+     "#b1f3ba", "#aef2b7", "#aaf2b4", "#a7f1b1", "#a3f1ae", "#9ff0ab", "#9cf0a9", "#98efa6", "#94efa3", "#90eea0",
+     "#8ced9d", "#88ed9a", "#84ec98", "#80ec95", "#7ceb92", "#78ea8f", "#73ea8c", "#6fe989", "#6ae886", "#65e883",
+     "#60e781", "#5ae67e", "#55e67b", "#4fe578", "#48e475", "#41e472", "#39e36f", "#30e26c", "#25e269", "#14e166"
+ ]
+
+ # from 0.00 to 1.00, 41 colors
+ VUL_GRADIENT_COLORS = ["#d3f8d6",
+     "#fdcfc9", "#fdccc5", "#fcc9c2", "#fcc5bf", "#fcc2bb", "#fbbfb8", "#fbbcb4", "#fab9b1", "#fab5ad", "#f9b2aa",
+     "#f8afa7", "#f8aca3", "#f7a8a0", "#f7a59c", "#f6a299", "#f59f96", "#f59c92", "#f4988f", "#f3958c", "#f29288",
+     "#f18e85", "#f18b82", "#f0887f", "#ef847c", "#ee8178", "#ed7e75", "#ec7a72", "#eb776f", "#ea736c", "#e97068",
+     "#e86c65", "#e76962", "#e6655f", "#e5615c", "#e45e59", "#e35a56", "#e25653", "#e05250", "#df4e4d", "#de4a4a"
+ ]
+
+
+ logging.basicConfig(level=logging.INFO,
+                     format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
+ logging.getLogger("httpx").setLevel(logging.WARNING)
+
+
+ def generate_color_map():
+     color_map = {}
+     for i in range(0, 101):
+         color_map[f"non-vul-fixing: {i/100:0.2f}"] = NONVUL_GRADIENT_COLORS[int(i * 0.4)]
+         color_map[f"vul-fixing: {i/100:0.2f}"] = VUL_GRADIENT_COLORS[int(i * 0.4)]
+     return color_map
+
+
+ def on_submit(diff_code, patch_message, cwe_model, section_model_type, progress = gr.Progress(track_tqdm=True), *model_config):
+     if diff_code == "":
+         return gr.skip(), gr.skip(), gr.skip()
+
+     try:
+         section_results = section_infer(diff_code, patch_message, section_model_type, *model_config)
+     except Exception as e:
+         raise gr.Error(f"Error: {str(e)}")
+
+     vul_cnt = 0
+     for file_results in section_results.values():
+         for item in file_results:
+             if item["predict"] == 1:
+                 vul_cnt += 1
+     label_text = "Vul-fixing patch" if vul_cnt > 0 \
+         else "Non-vul-fixing patch"
+     color = "#de4a4a" if vul_cnt > 0 else "#14e166"
+     patch_category_label = gr.Label(value = label_text, color = color)
+
+     if cwe_model == "":
+         cwe_cls_result = "No model selected"
+     elif vul_cnt == 0:
+         cwe_cls_result = "No vulnerability found"
+     else:
+         cwe_cls_result = cwe_infer(diff_code, patch_message, cwe_model)
+
+     return patch_category_label, section_results, cwe_cls_result
+
+
+ with gr.Blocks(title = APP_TITLE, fill_width=True) as demo:
+
+     section_results_state = gr.State({})
+     cls_results_state = gr.State({})
+
+     title = gr.HTML(STYLE_APP_TITLE)
+
+     with gr.Row() as main_block:
+
+         with gr.Column(scale=1) as input_block:
+             diff_codebox = gr.Code(label="Input git diff here", max_lines=10)
+
+             with gr.Accordion("Patch message (optional)", open=False):
+                 message_textbox = gr.Textbox(label="Patch message", placeholder="Enter patch message here", container=False, lines=2, max_lines=5)
+
+             cwe_model_selector = gr.Dropdown(PREDEF_CWE_MODEL, label="Select vulnerability type classifier", allow_custom_value=True)
+
+             with gr.Tabs(selected=0) as model_type_tabs:
+                 MODEL_TYPE_NAMES = list(PREDEF_MODEL_MAP.keys())
+                 with gr.Tab(MODEL_TYPE_NAMES[0]) as local_llm_tab:
+                     local_model_selector = gr.Dropdown(PREDEF_MODEL_MAP[MODEL_TYPE_NAMES[0]], label="Select model", allow_custom_value=True)
+                     local_peft_selector = gr.Dropdown(LOCAL_MODEL_PEFT_MAP[local_model_selector.value], label="Select PEFT model (optional)", allow_custom_value=True)
+                     local_submit_btn = gr.Button("Run", variant="primary")
+                 with gr.Tab(MODEL_TYPE_NAMES[1]) as online_llm_tab:
+                     online_model_selector = gr.Dropdown(PREDEF_MODEL_MAP[MODEL_TYPE_NAMES[1]], label="Select model", allow_custom_value=True)
+                     online_api_url_textbox = gr.Textbox(label="API URL")
+                     online_api_key_textbox = gr.Textbox(label="API Key", placeholder="We won't store your API key", value=os.getenv("ONLINE_API_KEY"), type="password")
+                     online_submit_btn = gr.Button("Run", variant="primary")
+
+             section_model_type = gr.State(model_type_tabs.children[0].label)
+
+             with gr.Accordion("Load examples", open=False):
+                 with open("./backend/examples.json", "r") as f:
+                     examples = json.load(f)
+                 gr.Button("Load example 1", size='sm').click(lambda: examples[0], outputs=[diff_codebox, message_textbox])
+                 gr.Button("Load example 2", size='sm').click(lambda: examples[1], outputs=[diff_codebox, message_textbox])
+                 gr.Button("Load example 3", size='sm').click(lambda: examples[2], outputs=[diff_codebox, message_textbox])
+
+         with gr.Column(scale=2) as section_result_block:
+             @gr.render(inputs=section_results_state, triggers=[section_results_state.change, demo.load])
+             def display_result(section_results):
+                 if not section_results:
+                     with gr.Tab("File tabs"):
+                         gr.Markdown("No results")
+                 else:
+                     for file_name, file_results in section_results.items():
+                         with gr.Tab(file_name) as file_tab:
+                             highlighted_results = []
+                             full_color_map = generate_color_map()
+                             this_color_map = {}
+                             predict_result = {-1: 'error', 0: 'non-vul-fixing', 1: 'vul-fixing'}
+                             for item in file_results:
+                                 text_label = f"{predict_result[item['predict']]}: {item['conf']:0.2f}"
+                                 this_color_map[text_label] = full_color_map[text_label]
+                                 highlighted_results.append((
+                                     item["section"],
+                                     text_label
+                                 ))
+                             gr.HighlightedText(
+                                 highlighted_results,
+                                 label="Results",
+                                 color_map=this_color_map
+                             )
+
+         with gr.Column(scale=1) as result_block:
+             patch_category_label = gr.Label(value = "No results", label = "Result of the whole patch")
+             def update_vul_type_label(cls_results):
+                 return gr.Label(cls_results)
+             vul_type_label = gr.Label(update_vul_type_label, label = "Possible fixed vulnerability type", inputs = [cls_results_state])
+
+
+     def update_model_type_state(evt: gr.SelectData):
+         return evt.value
+     model_type_tabs.select(update_model_type_state, outputs = [section_model_type])
+
+     def update_support_peft(base_model):
+         return gr.Dropdown(LOCAL_MODEL_PEFT_MAP[base_model], value = LOCAL_MODEL_PEFT_MAP[base_model][0])
+     local_model_selector.change(update_support_peft, inputs=[local_model_selector], outputs = [local_peft_selector])
+
+     local_submit_btn.click(fn = on_submit,
+                            inputs = [diff_codebox, message_textbox, cwe_model_selector, section_model_type, local_model_selector, local_peft_selector],
+                            outputs = [patch_category_label, section_results_state, cls_results_state])
+     online_submit_btn.click(fn = on_submit,
+                             inputs = [diff_codebox, message_textbox, cwe_model_selector, section_model_type, online_model_selector, online_api_url_textbox, online_api_key_textbox],
+                             outputs = [patch_category_label, section_results_state, cls_results_state])
+
+
+ demo.launch()
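
Note on the confidence-to-color mapping above: each section result is rendered as a label like "vul-fixing: 0.87", and generate_color_map() pairs each of the 101 possible labels with one of the 41 gradient entries via the index int(i * 0.4). A minimal sketch of that arithmetic (color_for is a hypothetical helper, not part of app.py):

def color_for(kind, conf):
    # kind is "vul-fixing" or "non-vul-fixing"; conf is in [0.0, 1.0]
    i = round(conf * 100)                       # e.g. 0.87 -> 87
    palette = VUL_GRADIENT_COLORS if kind == "vul-fixing" else NONVUL_GRADIENT_COLORS
    return palette[int(i * 0.4)]                # 87 -> index 34 of the 41 colors

assert color_for("vul-fixing", 1.0) == "#de4a4a"        # deepest red
assert color_for("non-vul-fixing", 0.0) == "#d3f8d6"    # palest green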
backend/cwe_infer_helper.py ADDED
@@ -0,0 +1,109 @@
+ import logging
+ from transformers import pipeline, AutoTokenizer
+
+ logger = logging.getLogger(__name__)
+
+
+ class ModelConfigMap(dict):
+     # Key-dependent fallback: an unknown model name is treated as a Hub id
+     # used for both the model and its tokenizer. (A defaultdict factory takes
+     # no key argument, so __missing__ is used instead.)
+     def __missing__(self, model):
+         return {
+             "model_name_or_path": model,
+             "tokenizer_name": model
+         }
+
+
+ class CweInferHelper():
+
+     TOP_K = 5
+     MAX_LENGTH = 1024
+
+     MODEL_CONFIG = ModelConfigMap({
+         "patchouli-cwe-UniXcoder": {
+             "model_name_or_path": "./backend/model/cwe-cls/patchouli-unixcoder",
+             "tokenizer_name": "microsoft/unixcoder-base-nine"
+         }
+     })
+
+     PREDEF_MODEL = list(MODEL_CONFIG.keys())
+
+     def __init__(self):
+         self.model = None
+         self.classifier = None
+         self.tokenizer = None
+
+
+     def load_model(self, model):
+         logger.info(f"Loading CWE classification model: {model}")
+         if model == self.model:
+             return
+         self.model = model
+         model_name_or_path = self.MODEL_CONFIG[model]["model_name_or_path"]
+         tokenizer_name = self.MODEL_CONFIG[model]["tokenizer_name"]
+         self.classifier = pipeline("text-classification", model=model_name_or_path, tokenizer=tokenizer_name, device_map="auto")
+         self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)
+
+
+     def infer(self, diff_code, patch_message = None):
+         if self.classifier is None:
+             raise ValueError("Model is not loaded")
+         input_text = ""
+         if patch_message is not None and patch_message != "":
+             input_text += f"[MESSAGE]\n{patch_message}\n"
+         input_text += f"[PATCH]\n{diff_code}"
+         logger.info("Classifying CWE for diff code")
+         # Cap the input at the classifier's budget by tokenizing with
+         # truncation, then decoding back to text before calling the pipeline.
+         input_ids = self.tokenizer(input_text, max_length=CweInferHelper.MAX_LENGTH-10, padding="max_length", truncation=True).input_ids
+         input_text = self.tokenizer.decode(input_ids)
+         result = self.classifier(input_text, top_k = self.TOP_K)
+         result = {item["label"]: item["score"] for item in result}
+         return result
+
+
+ cwe_infer_helper = CweInferHelper()
+
+
+ if __name__ == "__main__":
+     code = """diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c
+ index 6bde12da2fe003..c37ac2d7bec44d 100644
+ --- a/net/netfilter/ipvs/ip_vs_ctl.c
+ +++ b/net/netfilter/ipvs/ip_vs_ctl.c
+ @@ -2077,6 +2077,10 @@ do_ip_vs_set_ctl(struct sock *sk, int cmd, void __user *user, unsigned int len)
+      if (!capable(CAP_NET_ADMIN))
+          return -EPERM;
+  
+ +    if (cmd < IP_VS_BASE_CTL || cmd > IP_VS_SO_SET_MAX)
+ +        return -EINVAL;
+ +    if (len < 0 || len > MAX_ARG_LEN)
+ +        return -EINVAL;
+      if (len != set_arglen[SET_CMDID(cmd)]) {
+          pr_err("set_ctl: len %u != %u\n",
+                 len, set_arglen[SET_CMDID(cmd)]);
+ @@ -2352,17 +2356,25 @@ do_ip_vs_get_ctl(struct sock *sk, int cmd, void __user *user, int *len)
+  {
+      unsigned char arg[128];
+      int ret = 0;
+ +    unsigned int copylen;
+  
+      if (!capable(CAP_NET_ADMIN))
+          return -EPERM;
+  
+ +    if (cmd < IP_VS_BASE_CTL || cmd > IP_VS_SO_GET_MAX)
+ +        return -EINVAL;
+ +
+      if (*len < get_arglen[GET_CMDID(cmd)]) {
+          pr_err("get_ctl: len %u < %u\n",
+                 *len, get_arglen[GET_CMDID(cmd)]);
+          return -EINVAL;
+      }
+  
+ -    if (copy_from_user(arg, user, get_arglen[GET_CMDID(cmd)]) != 0)
+ +    copylen = get_arglen[GET_CMDID(cmd)];
+ +    if (copylen > 128)
+ +        return -EINVAL;
+ +
+ +    if (copy_from_user(arg, user, copylen) != 0)
+          return -EFAULT;
+  
+      if (mutex_lock_interruptible(&__ip_vs_mutex))
+ """
+     cwe_infer_helper.load_model("patchouli-cwe-UniXcoder")
+     result = cwe_infer_helper.infer(code)
+     print(result)
+
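
Note on the output shape: a transformers text-classification pipeline called with top_k returns a list of {label, score} dicts, which infer() flattens into a label-to-score mapping that gr.Label in app.py can display directly. An illustrative sketch (labels and scores here are made up; only the shapes follow the pipeline):

raw = [
    {"label": "CWE-119", "score": 0.42},
    {"label": "CWE-20", "score": 0.31},
]
as_mapping = {item["label"]: item["score"] for item in raw}
# {'CWE-119': 0.42, 'CWE-20': 0.31}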
backend/examples.json ADDED
@@ -0,0 +1,14 @@
+ [
+ [
+ "diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c\nindex ec83f413a7ed19..88a565f130a5a2 100644\n--- a/net/wireless/nl80211.c\n+++ b/net/wireless/nl80211.c\n@@ -3406,12 +3406,12 @@ static int nl80211_trigger_scan(struct sk_buff *skb, struct genl_info *info)\n \ti = 0;\n \tif (info->attrs[NL80211_ATTR_SCAN_SSIDS]) {\n \t\tnla_for_each_nested(attr, info->attrs[NL80211_ATTR_SCAN_SSIDS], tmp) {\n+\t\t\trequest->ssids[i].ssid_len = nla_len(attr);\n \t\t\tif (request->ssids[i].ssid_len > IEEE80211_MAX_SSID_LEN) {\n \t\t\t\terr = -EINVAL;\n \t\t\t\tgoto out_free;\n \t\t\t}\n \t\t\tmemcpy(request->ssids[i].ssid, nla_data(attr), nla_len(attr));\n-\t\t\trequest->ssids[i].ssid_len = nla_len(attr);\n \t\t\ti++;\n \t\t}\n \t}\n@@ -3572,6 +3572,7 @@ static int nl80211_start_sched_scan(struct sk_buff *skb,\n \tif (info->attrs[NL80211_ATTR_SCAN_SSIDS]) {\n \t\tnla_for_each_nested(attr, info->attrs[NL80211_ATTR_SCAN_SSIDS],\n \t\t\t\t tmp) {\n+\t\t\trequest->ssids[i].ssid_len = nla_len(attr);\n \t\t\tif (request->ssids[i].ssid_len >\n \t\t\t IEEE80211_MAX_SSID_LEN) {\n \t\t\t\terr = -EINVAL;\n@@ -3579,7 +3580,6 @@ static int nl80211_start_sched_scan(struct sk_buff *skb,\n \t\t\t}\n \t\t\tmemcpy(request->ssids[i].ssid, nla_data(attr),\n \t\t\t nla_len(attr));\n-\t\t\trequest->ssids[i].ssid_len = nla_len(attr);\n \t\t\ti++;\n \t\t}\n \t}",
+ "nl80211: fix check for valid SSID size in scan operations\nIn both trigger_scan and sched_scan operations, we were checking for\nthe SSID length before assigning the value correctly. Since the\nmemory was just kzalloc'ed, the check was always failing and SSID with\nover 32 characters were allowed to go through.\n\nThis was causing a buffer overflow when copying the actual SSID to the\nproper place.\n\nThis bug has been there since 2.6.29-rc4.\n\nCc: [email protected]\nSigned-off-by: Luciano Coelho <[email protected]>\nSigned-off-by: John W. Linville <[email protected]>"
+ ],
+ [
+ "diff --git a/src/include/utils/datetime.h b/src/include/utils/datetime.h\nindex 12f1e7753c..b10648269f 100644\n--- a/src/include/utils/datetime.h\n+++ b/src/include/utils/datetime.h\n@@ -188,12 +188,17 @@ struct tzEntry;\n #define DTK_DATE_M\t\t(DTK_M(YEAR) | DTK_M(MONTH) | DTK_M(DAY))\n #define DTK_TIME_M\t\t(DTK_M(HOUR) | DTK_M(MINUTE) | DTK_ALL_SECS_M)\n \n-#define MAXDATELEN\t\t63\t\t/* maximum possible length of an input date\n-\t\t\t\t\t\t\t\t * string (not counting tr. null) */\n-#define MAXDATEFIELDS\t25\t\t/* maximum possible number of fields in a date\n-\t\t\t\t\t\t\t\t * string */\n-#define TOKMAXLEN\t\t10\t\t/* only this many chars are stored in\n-\t\t\t\t\t\t\t\t * datetktbl */\n+/*\n+ * Working buffer size for input and output of interval, timestamp, etc.\n+ * Inputs that need more working space will be rejected early. Longer outputs\n+ * will overrun buffers, so this must suffice for all possible output. As of\n+ * this writing, interval_out() needs the most space at ~90 bytes.\n+ */\n+#define MAXDATELEN\t\t128\n+/* maximum possible number of fields in a date string */\n+#define MAXDATEFIELDS\t25\n+/* only this many chars are stored in datetktbl */\n+#define TOKMAXLEN\t\t10\n \n /* keep this struct small; it gets used a lot */\n typedef struct\ndiff --git a/src/interfaces/ecpg/pgtypeslib/datetime.c b/src/interfaces/ecpg/pgtypeslib/datetime.c\nindex 6600759220..a271cdd7d1 100644\n--- a/src/interfaces/ecpg/pgtypeslib/datetime.c\n+++ b/src/interfaces/ecpg/pgtypeslib/datetime.c\n@@ -60,14 +60,14 @@ PGTYPESdate_from_asc(char *str, char **endptr)\n \tint\t\t\tnf;\n \tchar\t *field[MAXDATEFIELDS];\n \tint\t\t\tftype[MAXDATEFIELDS];\n-\tchar\t\tlowstr[MAXDATELEN + 1];\n+\tchar\t\tlowstr[MAXDATELEN + MAXDATEFIELDS];\n \tchar\t *realptr;\n \tchar\t **ptr = (endptr != NULL) ? endptr : &realptr;\n \n \tbool\t\tEuroDates = FALSE;\n \n \terrno = 0;\n-\tif (strlen(str) >= sizeof(lowstr))\n+\tif (strlen(str) > MAXDATELEN)\n \t{\n \t\terrno = PGTYPES_DATE_BAD_DATE;\n \t\treturn INT_MIN;\ndiff --git a/src/interfaces/ecpg/pgtypeslib/dt.h b/src/interfaces/ecpg/pgtypeslib/dt.h\nindex d7a1935516..3a50d1410e 100644\n--- a/src/interfaces/ecpg/pgtypeslib/dt.h\n+++ b/src/interfaces/ecpg/pgtypeslib/dt.h\n@@ -192,12 +192,17 @@ typedef double fsec_t;\n #define DTK_DATE_M\t\t(DTK_M(YEAR) | DTK_M(MONTH) | DTK_M(DAY))\n #define DTK_TIME_M\t\t(DTK_M(HOUR) | DTK_M(MINUTE) | DTK_M(SECOND))\n \n-#define MAXDATELEN\t\t63\t\t/* maximum possible length of an input date\n-\t\t\t\t\t\t\t\t * string (not counting tr. null) */\n-#define MAXDATEFIELDS\t25\t\t/* maximum possible number of fields in a date\n-\t\t\t\t\t\t\t\t * string */\n-#define TOKMAXLEN\t\t10\t\t/* only this many chars are stored in\n-\t\t\t\t\t\t\t\t * datetktbl */\n+/*\n+ * Working buffer size for input and output of interval, timestamp, etc.\n+ * Inputs that need more working space will be rejected early. Longer outputs\n+ * will overrun buffers, so this must suffice for all possible output. 
As of\n+ * this writing, PGTYPESinterval_to_asc() needs the most space at ~90 bytes.\n+ */\n+#define MAXDATELEN\t\t128\n+/* maximum possible number of fields in a date string */\n+#define MAXDATEFIELDS\t25\n+/* only this many chars are stored in datetktbl */\n+#define TOKMAXLEN\t\t10\n \n /* keep this struct small; it gets used a lot */\n typedef struct\ndiff --git a/src/interfaces/ecpg/pgtypeslib/dt_common.c b/src/interfaces/ecpg/pgtypeslib/dt_common.c\nindex 112538ed50..c5d91ed922 100644\n--- a/src/interfaces/ecpg/pgtypeslib/dt_common.c\n+++ b/src/interfaces/ecpg/pgtypeslib/dt_common.c\n@@ -1171,15 +1171,22 @@ DecodeNumberField(int len, char *str, int fmask,\n \tif ((cp = strchr(str, '.')) != NULL)\n \t{\n #ifdef HAVE_INT64_TIMESTAMP\n-\t\tchar\t\tfstr[MAXDATELEN + 1];\n+\t\tchar\t\tfstr[7];\n+\t\tint\t\t\ti;\n+\n+\t\tcp++;\n \n \t\t/*\n \t\t * OK, we have at most six digits to care about. Let's construct a\n-\t\t * string and then do the conversion to an integer.\n+\t\t * string with those digits, zero-padded on the right, and then do\n+\t\t * the conversion to an integer.\n+\t\t *\n+\t\t * XXX This truncates the seventh digit, unlike rounding it as do\n+\t\t * the backend and the !HAVE_INT64_TIMESTAMP case.\n \t\t */\n-\t\tstrcpy(fstr, (cp + 1));\n-\t\tstrcpy(fstr + strlen(fstr), \"000000\");\n-\t\t*(fstr + 6) = '\\0';\n+\t\tfor (i = 0; i < 6; i++)\n+\t\t\tfstr[i] = *cp != '\\0' ? *cp++ : '0';\n+\t\tfstr[i] = '\\0';\n \t\t*fsec = strtol(fstr, NULL, 10);\n #else\n \t\t*fsec = strtod(cp, NULL);\n@@ -1531,15 +1538,22 @@ DecodeTime(char *str, int *tmask, struct tm * tm, fsec_t *fsec)\n \t\telse if (*cp == '.')\n \t\t{\n #ifdef HAVE_INT64_TIMESTAMP\n-\t\t\tchar\t\tfstr[MAXDATELEN + 1];\n+\t\t\tchar\t\tfstr[7];\n+\t\t\tint\t\t\ti;\n+\n+\t\t\tcp++;\n \n \t\t\t/*\n-\t\t\t * OK, we have at most six digits to work with. Let's construct a\n-\t\t\t * string and then do the conversion to an integer.\n+\t\t\t * OK, we have at most six digits to care about. Let's construct a\n+\t\t\t * string with those digits, zero-padded on the right, and then do\n+\t\t\t * the conversion to an integer.\n+\t\t\t *\n+\t\t\t * XXX This truncates the seventh digit, unlike rounding it as do\n+\t\t\t * the backend and the !HAVE_INT64_TIMESTAMP case.\n \t\t\t */\n-\t\t\tstrncpy(fstr, (cp + 1), 7);\n-\t\t\tstrcpy(fstr + strlen(fstr), \"000000\");\n-\t\t\t*(fstr + 6) = '\\0';\n+\t\t\tfor (i = 0; i < 6; i++)\n+\t\t\t\tfstr[i] = *cp != '\\0' ? *cp++ : '0';\n+\t\t\tfstr[i] = '\\0';\n \t\t\t*fsec = strtol(fstr, &cp, 10);\n #else\n \t\t\tstr = cp;\n@@ -1665,6 +1679,9 @@ DecodePosixTimezone(char *str, int *tzp)\n *\tDTK_NUMBER can hold date fields (yy.ddd)\n *\tDTK_STRING can hold months (January) and time zones (PST)\n *\tDTK_DATE can hold Posix time zones (GMT-8)\n+ *\n+ * The \"lowstr\" work buffer must have at least strlen(timestr) + MAXDATEFIELDS\n+ * bytes of space. On output, field[] entries will point into it.\n */\n int\n ParseDateTime(char *timestr, char *lowstr,\n@@ -1677,7 +1694,10 @@ ParseDateTime(char *timestr, char *lowstr,\n \t/* outer loop through fields */\n \twhile (*(*endstr) != '\\0')\n \t{\n+\t\t/* Record start of current field */\n \t\tfield[nf] = lp;\n+\t\tif (nf >= MAXDATEFIELDS)\n+\t\t\treturn -1;\n \n \t\t/* leading digit? 
then date or time */\n \t\tif (isdigit((unsigned char) *(*endstr)))\n@@ -1818,8 +1838,6 @@ ParseDateTime(char *timestr, char *lowstr,\n \t\t/* force in a delimiter after each field */\n \t\t*lp++ = '\\0';\n \t\tnf++;\n-\t\tif (nf > MAXDATEFIELDS)\n-\t\t\treturn -1;\n \t}\n \n \t*numfields = nf;\ndiff --git a/src/interfaces/ecpg/pgtypeslib/interval.c b/src/interfaces/ecpg/pgtypeslib/interval.c\nindex 6d0926882e..d0dee16690 100644\n--- a/src/interfaces/ecpg/pgtypeslib/interval.c\n+++ b/src/interfaces/ecpg/pgtypeslib/interval.c\n@@ -1094,7 +1094,7 @@ PGTYPESinterval_from_asc(char *str, char **endptr)\n \ttm->tm_sec = 0;\n \tfsec = 0;\n \n-\tif (strlen(str) >= sizeof(lowstr))\n+\tif (strlen(str) > MAXDATELEN)\n \t{\n \t\terrno = PGTYPES_INTVL_BAD_INTERVAL;\n \t\treturn NULL;\ndiff --git a/src/interfaces/ecpg/pgtypeslib/timestamp.c b/src/interfaces/ecpg/pgtypeslib/timestamp.c\nindex a560af3c38..b0f9bf1521 100644\n--- a/src/interfaces/ecpg/pgtypeslib/timestamp.c\n+++ b/src/interfaces/ecpg/pgtypeslib/timestamp.c\n@@ -294,7 +294,7 @@ PGTYPEStimestamp_from_asc(char *str, char **endptr)\n \tchar\t *realptr;\n \tchar\t **ptr = (endptr != NULL) ? endptr : &realptr;\n \n-\tif (strlen(str) >= sizeof(lowstr))\n+\tif (strlen(str) > MAXDATELEN)\n \t{\n \t\terrno = PGTYPES_TS_BAD_TIMESTAMP;\n \t\treturn (noresult);\ndiff --git a/src/interfaces/ecpg/test/expected/pgtypeslib-dt_test2.c b/src/interfaces/ecpg/test/expected/pgtypeslib-dt_test2.c\nindex d3ebb0e106..0ba1936f1d 100644\n--- a/src/interfaces/ecpg/test/expected/pgtypeslib-dt_test2.c\n+++ b/src/interfaces/ecpg/test/expected/pgtypeslib-dt_test2.c\n@@ -45,6 +45,15 @@ char *dates[] = { \"19990108foobar\",\n \t\t\t\t \"1999.008\",\n \t\t\t\t \"J2451187\",\n \t\t\t\t \"January 8, 99 BC\",\n+\t\t\t\t /*\n+\t\t\t\t * Maximize space usage in ParseDateTime() with 25\n+\t\t\t\t * (MAXDATEFIELDS) fields and 128 (MAXDATELEN) total length.\n+\t\t\t\t */\n+\t\t\t\t \"........................Xaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa\"\n+\t\t\t\t \"aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa\",\n+\t\t\t\t /* 26 fields */\n+\t\t\t\t \".........................aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa\"\n+\t\t\t\t \"aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa\",\n \t\t\t\t NULL };\n \n /* do not conflict with libc \"times\" symbol */\n@@ -52,6 +61,7 @@ static char *times[] = { \"0:04\",\n \t\t\t\t \"1:59 PDT\",\n \t\t\t\t \"13:24:40 -8:00\",\n \t\t\t\t \"13:24:40.495+3\",\n+\t\t\t\t \"13:24:40.123456789+3\",\n \t\t\t\t NULL };\n \n char *intervals[] = { \"1 minute\",\n@@ -73,22 +83,22 @@ main(void)\n \t\t \n \t\t \n \t\n-#line 52 \"dt_test2.pgc\"\n+#line 62 \"dt_test2.pgc\"\n date date1 ;\n \n-#line 53 \"dt_test2.pgc\"\n+#line 63 \"dt_test2.pgc\"\n timestamp ts1 , ts2 ;\n \n-#line 54 \"dt_test2.pgc\"\n+#line 64 \"dt_test2.pgc\"\n char * text ;\n \n-#line 55 \"dt_test2.pgc\"\n+#line 65 \"dt_test2.pgc\"\n interval * i1 ;\n \n-#line 56 \"dt_test2.pgc\"\n+#line 66 \"dt_test2.pgc\"\n date * dc ;\n /* exec sql end declare section */\n-#line 57 \"dt_test2.pgc\"\n+#line 67 \"dt_test2.pgc\"\n \n \n \tint i, j;\ndiff --git a/src/interfaces/ecpg/test/expected/pgtypeslib-dt_test2.stdout b/src/interfaces/ecpg/test/expected/pgtypeslib-dt_test2.stdout\nindex 24e9d26dfe..9a4587b498 100644\n--- a/src/interfaces/ecpg/test/expected/pgtypeslib-dt_test2.stdout\n+++ b/src/interfaces/ecpg/test/expected/pgtypeslib-dt_test2.stdout\n@@ -8,85 +8,104 @@ TS[3,0]: 1999-01-08 00:04:00\n TS[3,1]: 1999-01-08 01:59:00\n TS[3,2]: 1999-01-08 13:24:40\n TS[3,3]: 
1999-01-08 13:24:40.495\n+TS[3,4]: 1999-01-08 13:24:40.123456\n Date[4]: 1999-01-08 (N - F)\n TS[4,0]: 1999-01-08 00:04:00\n TS[4,1]: 1999-01-08 01:59:00\n TS[4,2]: 1999-01-08 13:24:40\n TS[4,3]: 1999-01-08 13:24:40.495\n+TS[4,4]: 1999-01-08 13:24:40.123456\n Date[5]: 1999-01-08 (N - F)\n TS[5,0]: 1999-01-08 00:04:00\n TS[5,1]: 1999-01-08 01:59:00\n TS[5,2]: 1999-01-08 13:24:40\n TS[5,3]: 1999-01-08 13:24:40.495\n+TS[5,4]: 1999-01-08 13:24:40.123456\n Date[6]: 1999-01-18 (N - F)\n TS[6,0]: 1999-01-18 00:04:00\n TS[6,1]: 1999-01-18 01:59:00\n TS[6,2]: 1999-01-18 13:24:40\n TS[6,3]: 1999-01-18 13:24:40.495\n+TS[6,4]: 1999-01-18 13:24:40.123456\n Date[7]: 2003-01-02 (N - F)\n TS[7,0]: 2003-01-02 00:04:00\n TS[7,1]: 2003-01-02 01:59:00\n TS[7,2]: 2003-01-02 13:24:40\n TS[7,3]: 2003-01-02 13:24:40.495\n+TS[7,4]: 2003-01-02 13:24:40.123456\n Date[8]: 1999-01-08 (N - F)\n TS[8,0]: 1999-01-08 00:04:00\n TS[8,1]: 1999-01-08 01:59:00\n TS[8,2]: 1999-01-08 13:24:40\n TS[8,3]: 1999-01-08 13:24:40.495\n+TS[8,4]: 1999-01-08 13:24:40.123456\n Date[9]: 1999-01-08 (N - F)\n TS[9,0]: 1999-01-08 00:04:00\n TS[9,1]: 1999-01-08 01:59:00\n TS[9,2]: 1999-01-08 13:24:40\n TS[9,3]: 1999-01-08 13:24:40.495\n+TS[9,4]: 1999-01-08 13:24:40.123456\n Date[10]: 1999-01-08 (N - F)\n TS[10,0]: 1999-01-08 00:04:00\n TS[10,1]: 1999-01-08 01:59:00\n TS[10,2]: 1999-01-08 13:24:40\n TS[10,3]: 1999-01-08 13:24:40.495\n+TS[10,4]: 1999-01-08 13:24:40.123456\n Date[11]: 1999-01-08 (N - F)\n TS[11,0]: 1999-01-08 00:04:00\n TS[11,1]: 1999-01-08 01:59:00\n TS[11,2]: 1999-01-08 13:24:40\n TS[11,3]: 1999-01-08 13:24:40.495\n+TS[11,4]: 1999-01-08 13:24:40.123456\n Date[12]: 1999-01-08 (N - F)\n TS[12,0]: 1999-01-08 00:04:00\n TS[12,1]: 1999-01-08 01:59:00\n TS[12,2]: 1999-01-08 13:24:40\n TS[12,3]: 1999-01-08 13:24:40.495\n+TS[12,4]: 1999-01-08 13:24:40.123456\n Date[13]: 2006-01-08 (N - F)\n TS[13,0]: 2006-01-08 00:04:00\n TS[13,1]: 2006-01-08 01:59:00\n TS[13,2]: 2006-01-08 13:24:40\n TS[13,3]: 2006-01-08 13:24:40.495\n+TS[13,4]: 2006-01-08 13:24:40.123456\n Date[14]: 1999-01-08 (N - F)\n TS[14,0]: 1999-01-08 00:04:00\n TS[14,1]: 1999-01-08 01:59:00\n TS[14,2]: 1999-01-08 13:24:40\n TS[14,3]: 1999-01-08 13:24:40.495\n+TS[14,4]: 1999-01-08 13:24:40.123456\n Date[15]: 1999-01-08 (N - F)\n TS[15,0]: 1999-01-08 00:04:00\n TS[15,1]: 1999-01-08 01:59:00\n TS[15,2]: 1999-01-08 13:24:40\n TS[15,3]: 1999-01-08 13:24:40.495\n+TS[15,4]: 1999-01-08 13:24:40.123456\n Date[16]: 1999-01-08 (N - F)\n TS[16,0]: 1999-01-08 00:04:00\n TS[16,1]: 1999-01-08 01:59:00\n TS[16,2]: 1999-01-08 13:24:40\n TS[16,3]: 1999-01-08 13:24:40.495\n+TS[16,4]: 1999-01-08 13:24:40.123456\n Date[17]: 1999-01-08 (N - F)\n TS[17,0]: 1999-01-08 00:04:00\n TS[17,1]: 1999-01-08 01:59:00\n TS[17,2]: 1999-01-08 13:24:40\n TS[17,3]: 1999-01-08 13:24:40.495\n+TS[17,4]: 1999-01-08 13:24:40.123456\n Date[18]: 1999-01-08 (N - F)\n TS[18,0]: 1999-01-08 00:04:00\n TS[18,1]: 1999-01-08 01:59:00\n TS[18,2]: 1999-01-08 13:24:40\n TS[18,3]: 1999-01-08 13:24:40.495\n+TS[18,4]: 1999-01-08 13:24:40.123456\n Date[19]: 0099-01-08 BC (N - F)\n TS[19,0]: 0099-01-08 00:04:00 BC\n TS[19,1]: 0099-01-08 01:59:00 BC\n TS[19,2]: 0099-01-08 13:24:40 BC\n+TS[19,4]: 0099-01-08 13:24:40.123456 BC\n+Date[20]: - (N - T)\n+Date[21]: - (N - T)\n interval[0]: @ 1 min\n interval_copy[0]: @ 1 min\n interval[1]: @ 1 day 12 hours 59 mins 10 secs\ndiff --git a/src/interfaces/ecpg/test/pgtypeslib/dt_test2.pgc b/src/interfaces/ecpg/test/pgtypeslib/dt_test2.pgc\nindex 0edf012fd1..a127dd93a6 100644\n--- 
a/src/interfaces/ecpg/test/pgtypeslib/dt_test2.pgc\n+++ b/src/interfaces/ecpg/test/pgtypeslib/dt_test2.pgc\n@@ -27,6 +27,15 @@ char *dates[] = { \"19990108foobar\",\n \t\t\t\t \"1999.008\",\n \t\t\t\t \"J2451187\",\n \t\t\t\t \"January 8, 99 BC\",\n+\t\t\t\t /*\n+\t\t\t\t * Maximize space usage in ParseDateTime() with 25\n+\t\t\t\t * (MAXDATEFIELDS) fields and 128 (MAXDATELEN) total length.\n+\t\t\t\t */\n+\t\t\t\t \"........................Xaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa\"\n+\t\t\t\t \"aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa\",\n+\t\t\t\t /* 26 fields */\n+\t\t\t\t \".........................aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa\"\n+\t\t\t\t \"aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa\",\n \t\t\t\t NULL };\n \n /* do not conflict with libc \"times\" symbol */\n@@ -34,6 +43,7 @@ static char *times[] = { \"0:04\",\n \t\t\t\t \"1:59 PDT\",\n \t\t\t\t \"13:24:40 -8:00\",\n \t\t\t\t \"13:24:40.495+3\",\n+\t\t\t\t \"13:24:40.123456789+3\",\n \t\t\t\t NULL };\n \n char *intervals[] = { \"1 minute\",\ndiff --git a/src/test/regress/expected/interval.out b/src/test/regress/expected/interval.out\nindex 3bf221187b..99fd0ca490 100644\n--- a/src/test/regress/expected/interval.out\n+++ b/src/test/regress/expected/interval.out\n@@ -306,6 +306,13 @@ select '4 millenniums 5 centuries 4 decades 1 year 4 months 4 days 17 minutes 31\n @ 4541 years 4 mons 4 days 17 mins 31 secs\n (1 row)\n \n+-- test long interval output\n+select '100000000y 10mon -1000000000d -1000000000h -10min -10.000001s ago'::interval;\n+ interval \n+-------------------------------------------------------------------------------------------\n+ @ 100000000 years 10 mons -1000000000 days -1000000000 hours -10 mins -10.000001 secs ago\n+(1 row)\n+\n -- test justify_hours() and justify_days()\n SELECT justify_hours(interval '6 months 3 days 52 hours 3 minutes 2 seconds') as \"6 mons 5 days 4 hours 3 mins 2 seconds\";\n 6 mons 5 days 4 hours 3 mins 2 seconds \ndiff --git a/src/test/regress/sql/interval.sql b/src/test/regress/sql/interval.sql\nindex f1da4c2911..7cee2864de 100644\n--- a/src/test/regress/sql/interval.sql\n+++ b/src/test/regress/sql/interval.sql\n@@ -108,6 +108,8 @@ select avg(f1) from interval_tbl;\n -- test long interval input\n select '4 millenniums 5 centuries 4 decades 1 year 4 months 4 days 17 minutes 31 seconds'::interval;\n \n+-- test long interval output\n+select '100000000y 10mon -1000000000d -1000000000h -10min -10.000001s ago'::interval;\n \n -- test justify_hours() and justify_days()\n \n-- \n2.30.2\n",
+ "Many server functions use the MAXDATELEN constant to size a buffer for\nparsing or displaying a datetime value. It was much too small for the\nlongest possible interval output and slightly too small for certain\nvalid timestamp input, particularly input with a long timezone name.\nThe long input was rejected needlessly; the long output caused\ninterval_out() to overrun its buffer. ECPG's pgtypes library has a copy\nof the vulnerable functions, which bore the same vulnerabilities along\nwith some of its own. In contrast to the server, certain long inputs\ncaused stack overflow rather than failing cleanly. Back-patch to 8.4\n(all supported versions).\n\nReported by Daniel SchAssler, reviewed by Tom Lane.\n\nSecurity: CVE-2014-0063"
+ ],
+ [
+ "diff --git a/src/lxc/lxclock.c b/src/lxc/lxclock.c\nindex fe13898df9..e9e95f7a01 100644\n--- a/src/lxc/lxclock.c\n+++ b/src/lxc/lxclock.c\n@@ -103,13 +103,13 @@ static char *lxclock_name(const char *p, const char *n)\n \tchar *rundir;\n \n \t/* lockfile will be:\n-\t * \"/run\" + \"/lock/lxc/$lxcpath/$lxcname + '\\0' if root\n+\t * \"/run\" + \"/lxc/lock/$lxcpath/$lxcname + '\\0' if root\n \t * or\n-\t * $XDG_RUNTIME_DIR + \"/lock/lxc/$lxcpath/$lxcname + '\\0' if non-root\n+\t * $XDG_RUNTIME_DIR + \"/lxc/lock/$lxcpath/$lxcname + '\\0' if non-root\n \t */\n \n-\t/* length of \"/lock/lxc/\" + $lxcpath + \"/\" + \".\" + $lxcname + '\\0' */\n-\tlen = strlen(\"/lock/lxc/\") + strlen(n) + strlen(p) + 3;\n+\t/* length of \"/lxc/lock/\" + $lxcpath + \"/\" + \".\" + $lxcname + '\\0' */\n+\tlen = strlen(\"/lxc/lock/\") + strlen(n) + strlen(p) + 3;\n \trundir = get_rundir();\n \tif (!rundir)\n \t\treturn NULL;\n@@ -120,7 +120,7 @@ static char *lxclock_name(const char *p, const char *n)\n \t\treturn NULL;\n \t}\n \n-\tret = snprintf(dest, len, \"%s/lock/lxc/%s\", rundir, p);\n+\tret = snprintf(dest, len, \"%s/lxc/lock/%s\", rundir, p);\n \tif (ret < 0 || ret >= len) {\n \t\tfree(dest);\n \t\tfree(rundir);\n@@ -128,40 +128,13 @@ static char *lxclock_name(const char *p, const char *n)\n \t}\n \tret = mkdir_p(dest, 0755);\n \tif (ret < 0) {\n-\t\t/* fall back to \"/tmp/\" + $(id -u) + \"/lxc\" + $lxcpath + \"/\" + \".\" + $lxcname + '\\0'\n-\t\t * * maximum length of $(id -u) is 10 calculated by (log (2 ** (sizeof(uid_t) * 8) - 1) / log 10 + 1)\n-\t\t * * lxcpath always starts with '/'\n-\t\t */\n-\t\tint l2 = 22 + strlen(n) + strlen(p);\n-\t\tif (l2 > len) {\n-\t\t\tchar *d;\n-\t\t\td = realloc(dest, l2);\n-\t\t\tif (!d) {\n-\t\t\t\tfree(dest);\n-\t\t\t\tfree(rundir);\n-\t\t\t\treturn NULL;\n-\t\t\t}\n-\t\t\tlen = l2;\n-\t\t\tdest = d;\n-\t\t}\n-\t\tret = snprintf(dest, len, \"/tmp/%d/lxc%s\", geteuid(), p);\n-\t\tif (ret < 0 || ret >= len) {\n-\t\t\tfree(dest);\n-\t\t\tfree(rundir);\n-\t\t\treturn NULL;\n-\t\t}\n-\t\tret = mkdir_p(dest, 0755);\n-\t\tif (ret < 0) {\n-\t\t\tfree(dest);\n-\t\t\tfree(rundir);\n-\t\t\treturn NULL;\n-\t\t}\n-\t\tret = snprintf(dest, len, \"/tmp/%d/lxc%s/.%s\", geteuid(), p, n);\n-\t} else\n-\t\tret = snprintf(dest, len, \"%s/lock/lxc/%s/.%s\", rundir, p, n);\n+\t\tfree(dest);\n+\t\tfree(rundir);\n+\t\treturn NULL;\n+\t}\n \n+\tret = snprintf(dest, len, \"%s/lxc/lock/%s/.%s\", rundir, p, n);\n \tfree(rundir);\n-\n \tif (ret < 0 || ret >= len) {\n \t\tfree(dest);\n \t\treturn NULL;\ndiff --git a/src/tests/locktests.c b/src/tests/locktests.c\nindex dd3393a893..233ca127c6 100644\n--- a/src/tests/locktests.c\n+++ b/src/tests/locktests.c\n@@ -122,7 +122,7 @@ int main(int argc, char *argv[])\n \t\texit(1);\n \t}\n \tstruct stat sb;\n-\tchar *pathname = RUNTIME_PATH \"/lock/lxc/var/lib/lxc/\";\n+\tchar *pathname = RUNTIME_PATH \"/lxc/lock/var/lib/lxc/\";\n \tret = stat(pathname, &sb);\n \tif (ret != 0) {\n \t\tfprintf(stderr, \"%d: filename %s not created\\n\", __LINE__,\n",
+ "lxclock: use /run/lxc/lock rather than /run/lock/lxc\nThis prevents an unprivileged user to use LXC to create arbitrary file\non the filesystem.\n\nSigned-off-by: Serge Hallyn <[email protected]>\nSigned-off-by: Tyler Hicks <[email protected]>\nAcked-by: Stephane Graber <[email protected]>"
+ ]
+ ]
backend/inference.py ADDED
@@ -0,0 +1,32 @@
+ from backend.section_infer_helper.local_llm_helper import local_llm_helper
+ from backend.section_infer_helper.online_llm_helper import online_llm_helper
+ from backend.cwe_infer_helper import cwe_infer_helper
+
+ MODEL_TYPE_HELPER_MAP = {
+     "Local LLM": local_llm_helper,
+     "Online LLM": online_llm_helper
+ }
+
+ PREDEF_MODEL_MAP = {
+     "Local LLM": local_llm_helper.PREDEF_MODEL,
+     "Online LLM": online_llm_helper.PREDEF_MODEL
+ }
+
+ LOCAL_MODEL_PEFT_MAP = local_llm_helper.MODEL_PEFT_MAP
+
+ PREDEF_CWE_MODEL = cwe_infer_helper.PREDEF_MODEL
+
+ def section_infer(diff_code, patch_message, model_type, *model_config):
+     helper = MODEL_TYPE_HELPER_MAP.get(model_type, None)
+     if helper is None:
+         raise ValueError(f"Model type {model_type} is not supported")
+     helper.load_model(*model_config)
+     results = helper.infer(diff_code, patch_message)
+     return results
+
+
+ def cwe_infer(diff_code, patch_message, *model_config):
+     helper = cwe_infer_helper
+     helper.load_model(*model_config)
+     results = helper.infer(diff_code, patch_message)
+     return results
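
Note: these two entry points can also be driven outside the Gradio UI. A minimal sketch, assuming the predefined local model and PEFT adapter shipped in this commit (the path "example.patch" is hypothetical; the extra positional arguments are forwarded as *model_config to the selected helper's load_model()):

from backend.inference import section_infer, cwe_infer

diff_code = open("example.patch").read()   # any unified git diff
patch_message = "optional commit message"

section_results = section_infer(diff_code, patch_message, "Local LLM",
                                "Qwen/Qwen2.5-Coder-0.5B-Instruct",
                                "backend/model/PEFT/patchouli-qwc2.5-0.5b")
cwe_scores = cwe_infer(diff_code, patch_message, "patchouli-cwe-UniXcoder")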
backend/model/PEFT/patchouli-qwc2.5-0.5b/adapter_config.json ADDED
@@ -0,0 +1,34 @@
+ {
+   "alpha_pattern": {},
+   "auto_mapping": null,
+   "base_model_name_or_path": "Qwen/Qwen2.5-Coder-0.5B-Instruct",
+   "bias": "none",
+   "fan_in_fan_out": false,
+   "inference_mode": true,
+   "init_lora_weights": true,
+   "layer_replication": null,
+   "layers_pattern": null,
+   "layers_to_transform": null,
+   "loftq_config": {},
+   "lora_alpha": 16,
+   "lora_dropout": 0.0,
+   "megatron_config": null,
+   "megatron_core": "megatron.core",
+   "modules_to_save": null,
+   "peft_type": "LORA",
+   "r": 8,
+   "rank_pattern": {},
+   "revision": null,
+   "target_modules": [
+     "down_proj",
+     "gate_proj",
+     "o_proj",
+     "v_proj",
+     "up_proj",
+     "q_proj",
+     "k_proj"
+   ],
+   "task_type": "CAUSAL_LM",
+   "use_dora": false,
+   "use_rslora": false
+ }
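
Note: this LoRA adapter (r=8, lora_alpha=16, targeting the attention and MLP projections) is attached to its base model at load time; backend/section_infer_helper/local_llm_helper.py does this via peft. A minimal standalone sketch of the same loading path, using only the names in this repo:

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import PeftModel

base = AutoModelForCausalLM.from_pretrained("Qwen/Qwen2.5-Coder-0.5B-Instruct",
                                            torch_dtype=torch.float32, device_map="auto")
model = PeftModel.from_pretrained(base, "backend/model/PEFT/patchouli-qwc2.5-0.5b")
tokenizer = AutoTokenizer.from_pretrained("backend/model/PEFT/patchouli-qwc2.5-0.5b")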
backend/model/PEFT/patchouli-qwc2.5-0.5b/adapter_model.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:d7b89f36b9e9ab81e75f2b2f0156e771254f09e6a2308b6c90cad9af7d93b02a
+ size 8841928
backend/model/PEFT/patchouli-qwc2.5-0.5b/added_tokens.json ADDED
@@ -0,0 +1,24 @@
+ {
+   "</tool_call>": 151658,
+   "<tool_call>": 151657,
+   "<|box_end|>": 151649,
+   "<|box_start|>": 151648,
+   "<|endoftext|>": 151643,
+   "<|file_sep|>": 151664,
+   "<|fim_middle|>": 151660,
+   "<|fim_pad|>": 151662,
+   "<|fim_prefix|>": 151659,
+   "<|fim_suffix|>": 151661,
+   "<|im_end|>": 151645,
+   "<|im_start|>": 151644,
+   "<|image_pad|>": 151655,
+   "<|object_ref_end|>": 151647,
+   "<|object_ref_start|>": 151646,
+   "<|quad_end|>": 151651,
+   "<|quad_start|>": 151650,
+   "<|repo_name|>": 151663,
+   "<|video_pad|>": 151656,
+   "<|vision_end|>": 151653,
+   "<|vision_pad|>": 151654,
+   "<|vision_start|>": 151652
+ }
backend/model/PEFT/patchouli-qwc2.5-0.5b/merges.txt ADDED
The diff for this file is too large to render.
backend/model/PEFT/patchouli-qwc2.5-0.5b/special_tokens_map.json ADDED
@@ -0,0 +1,31 @@
+ {
+   "additional_special_tokens": [
+     "<|im_start|>",
+     "<|im_end|>",
+     "<|object_ref_start|>",
+     "<|object_ref_end|>",
+     "<|box_start|>",
+     "<|box_end|>",
+     "<|quad_start|>",
+     "<|quad_end|>",
+     "<|vision_start|>",
+     "<|vision_end|>",
+     "<|vision_pad|>",
+     "<|image_pad|>",
+     "<|video_pad|>"
+   ],
+   "eos_token": {
+     "content": "<|im_end|>",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   },
+   "pad_token": {
+     "content": "<|endoftext|>",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   }
+ }
backend/model/PEFT/patchouli-qwc2.5-0.5b/tokenizer.json ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:9c5ae00e602b8860cbd784ba82a8aa14e8feecec692e7076590d014d7b7fdafa
+ size 11421896
backend/model/PEFT/patchouli-qwc2.5-0.5b/tokenizer_config.json ADDED
@@ -0,0 +1,208 @@
+ {
+   "add_bos_token": false,
+   "add_prefix_space": false,
+   "added_tokens_decoder": {
+     "151643": {
+       "content": "<|endoftext|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "151644": {
+       "content": "<|im_start|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "151645": {
+       "content": "<|im_end|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "151646": {
+       "content": "<|object_ref_start|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "151647": {
+       "content": "<|object_ref_end|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "151648": {
+       "content": "<|box_start|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "151649": {
+       "content": "<|box_end|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "151650": {
+       "content": "<|quad_start|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "151651": {
+       "content": "<|quad_end|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "151652": {
+       "content": "<|vision_start|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "151653": {
+       "content": "<|vision_end|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "151654": {
+       "content": "<|vision_pad|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "151655": {
+       "content": "<|image_pad|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "151656": {
+       "content": "<|video_pad|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "151657": {
+       "content": "<tool_call>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": false
+     },
+     "151658": {
+       "content": "</tool_call>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": false
+     },
+     "151659": {
+       "content": "<|fim_prefix|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": false
+     },
+     "151660": {
+       "content": "<|fim_middle|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": false
+     },
+     "151661": {
+       "content": "<|fim_suffix|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": false
+     },
+     "151662": {
+       "content": "<|fim_pad|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": false
+     },
+     "151663": {
+       "content": "<|repo_name|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": false
+     },
+     "151664": {
+       "content": "<|file_sep|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": false
+     }
+   },
+   "additional_special_tokens": [
+     "<|im_start|>",
+     "<|im_end|>",
+     "<|object_ref_start|>",
+     "<|object_ref_end|>",
+     "<|box_start|>",
+     "<|box_end|>",
+     "<|quad_start|>",
+     "<|quad_end|>",
+     "<|vision_start|>",
+     "<|vision_end|>",
+     "<|vision_pad|>",
+     "<|image_pad|>",
+     "<|video_pad|>"
+   ],
+   "bos_token": null,
+   "chat_template": "{%- if tools %}\n    {{- '<|im_start|>system\\n' }}\n    {%- if messages[0]['role'] == 'system' %}\n        {{- messages[0]['content'] }}\n    {%- else %}\n        {{- 'You are Qwen, created by Alibaba Cloud. You are a helpful assistant.' }}\n    {%- endif %}\n    {{- \"\\n\\n# Tools\\n\\nYou may call one or more functions to assist with the user query.\\n\\nYou are provided with function signatures within <tools></tools> XML tags:\\n<tools>\" }}\n    {%- for tool in tools %}\n        {{- \"\\n\" }}\n        {{- tool | tojson }}\n    {%- endfor %}\n    {{- \"\\n</tools>\\n\\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\\n<tool_call>\\n{\\\"name\\\": <function-name>, \\\"arguments\\\": <args-json-object>}\\n</tool_call><|im_end|>\\n\" }}\n{%- else %}\n    {%- if messages[0]['role'] == 'system' %}\n        {{- '<|im_start|>system\\n' + messages[0]['content'] + '<|im_end|>\\n' }}\n    {%- else %}\n        {{- '<|im_start|>system\\nYou are Qwen, created by Alibaba Cloud. You are a helpful assistant.<|im_end|>\\n' }}\n    {%- endif %}\n{%- endif %}\n{%- for message in messages %}\n    {%- if (message.role == \"user\") or (message.role == \"system\" and not loop.first) or (message.role == \"assistant\" and not message.tool_calls) %}\n        {{- '<|im_start|>' + message.role + '\\n' + message.content + '<|im_end|>' + '\\n' }}\n    {%- elif message.role == \"assistant\" %}\n        {{- '<|im_start|>' + message.role }}\n        {%- if message.content %}\n            {{- '\\n' + message.content }}\n        {%- endif %}\n        {%- for tool_call in message.tool_calls %}\n            {%- if tool_call.function is defined %}\n                {%- set tool_call = tool_call.function %}\n            {%- endif %}\n            {{- '\\n<tool_call>\\n{\"name\": \"' }}\n            {{- tool_call.name }}\n            {{- '\", \"arguments\": ' }}\n            {{- tool_call.arguments | tojson }}\n            {{- '}\\n</tool_call>' }}\n        {%- endfor %}\n        {{- '<|im_end|>\\n' }}\n    {%- elif message.role == \"tool\" %}\n        {%- if (loop.index0 == 0) or (messages[loop.index0 - 1].role != \"tool\") %}\n            {{- '<|im_start|>user' }}\n        {%- endif %}\n        {{- '\\n<tool_response>\\n' }}\n        {{- message.content }}\n        {{- '\\n</tool_response>' }}\n        {%- if loop.last or (messages[loop.index0 + 1].role != \"tool\") %}\n            {{- '<|im_end|>\\n' }}\n        {%- endif %}\n    {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n    {{- '<|im_start|>assistant\\n' }}\n{%- endif %}\n",
+   "clean_up_tokenization_spaces": false,
+   "eos_token": "<|im_end|>",
+   "errors": "replace",
+   "model_max_length": 32768,
+   "pad_token": "<|endoftext|>",
+   "padding_side": "right",
+   "split_special_tokens": false,
+   "tokenizer_class": "Qwen2Tokenizer",
+   "unk_token": null
+ }
backend/model/PEFT/patchouli-qwc2.5-0.5b/vocab.json ADDED
The diff for this file is too large to render.
backend/model/cwe-cls/patchouli-unixcoder/config.json ADDED
@@ -0,0 +1,57 @@
+ {
+   "_name_or_path": "microsoft/unixcoder-base-nine",
+   "architectures": [
+     "RobertaForSequenceClassification"
+   ],
+   "attention_probs_dropout_prob": 0.1,
+   "bos_token_id": 0,
+   "classifier_dropout": null,
+   "eos_token_id": 2,
+   "gradient_checkpointing": false,
+   "hidden_act": "gelu",
+   "hidden_dropout_prob": 0.1,
+   "hidden_size": 768,
+   "id2label": {
+     "0": "CWE-79",
+     "1": "CWE-787",
+     "2": "CWE-89",
+     "3": "CWE-352",
+     "4": "CWE-22",
+     "5": "CWE-125",
+     "6": "CWE-78",
+     "7": "CWE-416",
+     "8": "CWE-862",
+     "9": "CWE-434",
+     "10": "CWE-94",
+     "11": "CWE-20",
+     "12": "CWE-77",
+     "13": "CWE-287",
+     "14": "CWE-269",
+     "15": "CWE-502",
+     "16": "CWE-200",
+     "17": "CWE-863",
+     "18": "CWE-918",
+     "19": "CWE-119",
+     "20": "CWE-476",
+     "21": "CWE-798",
+     "22": "CWE-190",
+     "23": "CWE-400",
+     "24": "CWE-306"
+   },
+   "initializer_range": 0.02,
+   "intermediate_size": 3072,
+   "layer_norm_eps": 1e-05,
+   "max_position_embeddings": 1026,
+   "model_type": "roberta",
+   "num_attention_heads": 12,
+   "num_hidden_layers": 12,
+   "output_past": true,
+   "pad_token_id": 1,
+   "position_embedding_type": "absolute",
+   "problem_type": "single_label_classification",
+   "torch_dtype": "float32",
+   "transformers_version": "4.47.1",
+   "type_vocab_size": 10,
+   "use_cache": true,
+   "vocab_size": 51416
+ }
backend/model/cwe-cls/patchouli-unixcoder/model.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:63620e0ba21c9d79f01b0ca6aa1c32c09119da66e3e5fed7f6481e1e61a0d103
+ size 503819956
backend/section_infer_helper/base_helper.py ADDED
@@ -0,0 +1,51 @@
+ from backend.utils.data_process import LANGUAGE_EXT_MAP
+
+ from abc import ABC, abstractmethod
+
+ class BaseHelper(ABC):
+
+     @staticmethod
+     def _get_lang_ext(language_list):
+         ext_list = []
+         for lang in language_list:
+             ext_list.extend(LANGUAGE_EXT_MAP.get(lang, []))
+         return ext_list
+
+
+     @staticmethod
+     def _get_lang_by_ext(ext):
+         for lang, ext_list in LANGUAGE_EXT_MAP.items():
+             if ext in ext_list:
+                 return lang
+         return None
+
+
+     class InputData():
+         def __init__(self, filename, patch, section, patch_msg):
+             self.filename = filename
+             self.patch = patch
+             self.section = section
+             self.patch_msg = patch_msg
+
+
+     @abstractmethod
+     def load_model(self, *args, **kwargs):
+         raise NotImplementedError()
+
+
+     @abstractmethod
+     def infer(self, diff_code, message = None, batch_size = 1):
+         '''
+         Result format:
+         {
+             file_name: [
+                 {
+                     "section": section,
+                     "predict": 1/0,
+                     "conf": conf
+                 },
+                 ...
+             ],
+             ...
+         }
+         '''
+         raise NotImplementedError()
+
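
Note: a minimal sketch of what a concrete subclass of BaseHelper must provide (the always-benign stub below is illustrative only, not part of the repo; it reuses the diff-splitting utilities the real helpers use):

from backend.section_infer_helper.base_helper import BaseHelper
from backend.utils.data_process import split_to_file_diff, split_to_section

class AlwaysBenignHelper(BaseHelper):
    def load_model(self, *args, **kwargs):
        pass  # nothing to load for this stub

    def infer(self, diff_code, message = None, batch_size = 1):
        results = {}
        exts = BaseHelper._get_lang_ext(["C", "C++", "Java", "Python"])
        for file_a, _, file_diff in split_to_file_diff(diff_code, exts):
            file_name = file_a.removeprefix("a/")
            results[file_name] = [
                {"section": section, "predict": 0, "conf": 1.0}
                for section in split_to_section(file_diff)
            ]
        return results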
backend/section_infer_helper/local_llm_helper.py ADDED
@@ -0,0 +1,201 @@
+ import logging
+ import torch
+ from torch.nn.functional import softmax
+ from transformers import AutoTokenizer, AutoModelForCausalLM
+ from peft import PeftModel
+ from tqdm import tqdm
+ from collections import defaultdict
+
+ from backend.section_infer_helper.base_helper import BaseHelper
+ from backend.utils.data_process import split_to_file_diff, split_to_section
+
+
+ logger = logging.getLogger(__name__)
+
+
+ class LocalLLMHelper(BaseHelper):
+
+     MAX_LENGTH = 4096
+     MAX_NEW_TOKEN = 16
+     BATCH_SIZE = 4
+
+     SYSTEM_PROMPT = "You are now an expert in code vulnerability and patch fixes."
+
+     @staticmethod
+     def generate_instruction(language, file_name, patch, section, message=None):
+         instruction = "[TASK]\nHere is a patch in {} language and a section of this patch for a source code file with path {}. Determine if the patch section fixes any software vulnerabilities. Output 'yes' or 'no' and do not output any other text.\n".format(language, file_name)
+         instruction += "[Patch]\n{}\n".format(patch)
+         instruction += "[A section of this patch]\n{}\n".format(section)
+         if message is not None and message != "":
+             instruction += "[Message of the Patch]\n{}\n".format(message)
+
+         return instruction
+
+     # unknown (model, peft) keys fall back to the default language set
+     MODEL_CONFIGS = defaultdict(lambda: {
+         "supported_languages": ["C", "C++", "Java", "Python"],
+     })
+
+     MODEL_CONFIGS.update({
+         ("Qwen/Qwen2.5-Coder-0.5B-Instruct", "backend/model/PEFT/patchouli-qwc2.5-0.5b"): {
+             "supported_languages": ["C", "C++", "Java", "Python"],
+         },
+         ("Qwen/Qwen2.5-Coder-0.5B-Instruct", None): {
+             "supported_languages": ["C", "C++", "Java", "Python"],
+         },
+         ("Qwen/Qwen2.5-Coder-7B-Instruct", None): {
+             "supported_languages": ["C", "C++", "Java", "Python"],
+         },
+         ("deepseek-ai/deepseek-coder-7b-instruct-v1.5", None): {
+             "supported_languages": ["C", "C++", "Java", "Python"],
+         },
+         ("codellama/CodeLlama-7b-Instruct-hf", None): {
+             "supported_languages": ["C", "C++", "Java", "Python"],
+         },
+     })
+
+     PREDEF_MODEL = []
+     for model, peft in MODEL_CONFIGS.keys():
+         if model not in PREDEF_MODEL:
+             PREDEF_MODEL.append(model)
+     MODEL_PEFT_MAP = defaultdict(lambda: [None])
+     for model, peft in MODEL_CONFIGS.keys():
+         if peft is not None:
+             MODEL_PEFT_MAP[model].append(peft)
+
+
+     def __init__(self):
+         self.model = None
+         self.tokenizer = None
+         self.model_name_or_path = None
+         self.peft_name_or_path = None
+
+
+     def __del__(self):
+         if self.model is not None:
+             self.release_model()
+
+
+     def infer(self, diff_code, message=None, batch_size=BATCH_SIZE):
+         if self.model is None:
+             raise RuntimeError("Model is not loaded")
+
+         results = {}
+         input_list = []
+         file_diff_list = split_to_file_diff(diff_code, BaseHelper._get_lang_ext(LocalLLMHelper.MODEL_CONFIGS[self.model_name_or_path]["supported_languages"]))
+         for file_a, _, file_diff in file_diff_list:
+             sections = split_to_section(file_diff)
+             file_name = file_a.removeprefix("a/")
+             results[file_name] = []
+             for section in sections:
+                 input_list.append(BaseHelper.InputData(file_name, section, section, message))
+
+         input_prompt, output_text, output_prob = self.do_infer(input_list, batch_size)
+         assert len(input_list) == len(input_prompt) == len(output_text) == len(output_prob)
+         for i in range(len(input_list)):
+             file_name = input_list[i].filename
+             section = input_list[i].section
+             output_text_i = output_text[i].lower()
+             output_prob_i = output_prob[i]
+             results[file_name].append({
+                 "section": section,
+                 "predict": 1 if "yes" in output_text_i else 0,
+                 "conf": output_prob_i
+             })
+
+         return results
+
+
+     def load_model(self, model_name_or_path, peft_name_or_path=None):
+         if model_name_or_path == self.model_name_or_path and peft_name_or_path == self.peft_name_or_path:
+             return
+         logger.info(f"Loading model {model_name_or_path}")
+
+         if self.model is not None:
+             self.release_model()
+         self.model = AutoModelForCausalLM.from_pretrained(model_name_or_path, trust_remote_code=True, torch_dtype=torch.float32, device_map="auto")
+         self.tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, padding_side="left")
+         if peft_name_or_path is not None and peft_name_or_path != "" and peft_name_or_path != "None":
+             logger.info(f"Loading PEFT model {peft_name_or_path}")
+             self.model = PeftModel.from_pretrained(self.model, peft_name_or_path)
+             self.tokenizer = AutoTokenizer.from_pretrained(peft_name_or_path, padding_side="left")
+         if self.tokenizer.pad_token_id is None:
+             self.tokenizer.pad_token = self.tokenizer.eos_token
+
+         self.model.eval()
+
+         self.model_name_or_path = model_name_or_path
+         self.peft_name_or_path = peft_name_or_path
+         logger.info("Model loaded")
+
+
+     @staticmethod
+     def generate_message(filename, patch, section, patch_message=None):
+         ext = filename.split(".")[-1]
+         language = BaseHelper._get_lang_by_ext(ext)
+         messages = [
+             {
+                 "role": "system",
+                 "content": LocalLLMHelper.SYSTEM_PROMPT
+             },
+             {
+                 "role": "user",
+                 "content": LocalLLMHelper.generate_instruction(language, filename, patch, section, patch_message)
+             }
+         ]
+         return messages
+
+
+     def release_model(self):
+         del self.model
+         del self.tokenizer
+         self.model = None
+         self.tokenizer = None
+         torch.cuda.empty_cache()
+         logger.info(f"Model {self.model_name_or_path} released")
+         self.model_name_or_path = None
+
+
+     def do_infer(self, input_list, batch_size=BATCH_SIZE):
+         if not isinstance(input_list, list):
+             input_list = [input_list]
+
+         input_data_batches = [input_list[i:i+batch_size] for i in range(0, len(input_list), batch_size)]
+         input_ids_list = []
+         if len(input_list) > 0:
+             logger.info("Example input prompt")
+             logger.info(LocalLLMHelper.generate_message(input_list[0].filename, input_list[0].patch, input_list[0].section, input_list[0].patch_msg))
+
+         for batch in tqdm(input_data_batches, desc="Tokenizing", unit="batch", total=len(input_data_batches)):
+             message_list = []
+             for input_data in batch:
+                 message_list.append(LocalLLMHelper.generate_message(input_data.filename, input_data.patch, input_data.section, input_data.patch_msg))
+             input_ids_batch = self.tokenizer.apply_chat_template(
+                 message_list,
+                 add_generation_prompt=True,
+                 return_tensors="pt",
+                 max_length=LocalLLMHelper.MAX_LENGTH,
+                 truncation=True,
+                 padding=True)
+             input_ids_list.append(input_ids_batch)
+
+         input_prompt = []
+         output_text = []
+         output_prob = []
+
+         for input_ids in tqdm(input_ids_list, desc="Generating", unit="batch", total=len(input_ids_list)):
+             input_ids = input_ids.to(self.model.device)
+             outputs = self.model.generate(input_ids, max_new_tokens=LocalLLMHelper.MAX_NEW_TOKEN,
+                 eos_token_id=self.tokenizer.eos_token_id, pad_token_id=self.tokenizer.pad_token_id,
+                 output_logits=True, return_dict_in_generate=True)
+
+             input_prompt.extend(self.tokenizer.batch_decode(input_ids, skip_special_tokens=True))
+             output_text.extend(self.tokenizer.batch_decode(outputs.sequences[:, len(input_ids[0]):], skip_special_tokens=True))
+             # confidence = highest softmax probability of the first generated token
+             batch_output_prob = softmax(outputs.logits[0], dim=-1).max(dim=-1).values
+             output_prob.extend([float(p) for p in batch_output_prob])
+
+         return input_prompt, output_text, output_prob
+
+
+ local_llm_helper = LocalLLMHelper()
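
A usage sketch for the singleton above (not part of the commit; `example.diff` is a placeholder input, and the model/PEFT paths are the ones listed in `MODEL_CONFIGS`):

from backend.section_infer_helper.local_llm_helper import local_llm_helper

with open("example.diff") as f:  # hypothetical unified diff file
    diff_text = f.read()

local_llm_helper.load_model(
    "Qwen/Qwen2.5-Coder-0.5B-Instruct",
    "backend/model/PEFT/patchouli-qwc2.5-0.5b",
)
results = local_llm_helper.infer(diff_text, message="fix buffer overflow", batch_size=4)
for file_name, sections in results.items():
    for s in sections:
        print(file_name, s["predict"], round(s["conf"], 3))

Repeated `load_model` calls with the same arguments are no-ops; loading a different model first releases the old one.
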
backend/section_infer_helper/online_llm_helper.py ADDED
@@ -0,0 +1,138 @@
+ import logging
+ from openai import OpenAI
+ from tqdm import tqdm
+ from collections import defaultdict
+ import traceback
+ import httpx
+
+ from backend.utils.data_process import split_to_file_diff, split_to_section
+ from backend.section_infer_helper.base_helper import BaseHelper
+
+
+ logger = logging.getLogger(__name__)
+
+
+ class OnlineLLMHelper(BaseHelper):
+
+     MAX_LENGTH = 4096
+     MAX_NEW_TOKENS = 16
+
+     PREDEF_MODEL = ["gpt-3.5-turbo", "deepseek-chat", "qwen-coder-plus", "gpt-4-turbo", "gpt-4o", "gemini-1.5-pro-latest", "claude-3-5-sonnet-20241022"]
+
+     MODEL_CONFIGS = defaultdict(lambda: {
+         "supported_languages": ["C", "C++", "Java", "Python"],
+     })
+
+     SYSTEM_PROMPT = "You are an expert in code vulnerability and patch fixes."
+
+     @staticmethod
+     def generate_instruction(language, file_name, patch, section, message=None):
+         instruction = "[TASK]\nHere is a patch in {} language and a section of this patch for a source code file with path {}. Determine if the patch section fixes any software vulnerabilities. Output 'yes' or 'no' and do not output any other text.\n".format(language, file_name)
+         instruction += "[Patch]\n{}\n".format(patch)
+         instruction += "[A section of this patch]\n{}\n".format(section)
+         if message is not None and message != "":
+             instruction += "[Message of the Patch]\n{}\n".format(message)
+
+         return instruction
+
+
+     def __init__(self):
+         self.model_name = None
+         self.url = None
+         self.key = None
+
+
+     @staticmethod
+     def generate_message(filename, patch, section, patch_message=None):
+         ext = filename.split(".")[-1]
+         language = BaseHelper._get_lang_by_ext(ext)
+         user_message = OnlineLLMHelper.generate_instruction(language, filename, patch, section, patch_message)
+         user_message = user_message.split(" ")
+         user_message = user_message[:OnlineLLMHelper.MAX_LENGTH]  # crude length cap: keep the first MAX_LENGTH whitespace-separated tokens
+         user_message = " ".join(user_message)
+         messages = [
+             {
+                 "role": "system",
+                 "content": OnlineLLMHelper.SYSTEM_PROMPT
+             },
+             {
+                 "role": "user",
+                 "content": user_message
+             }
+         ]
+         return messages
+
+
+     def load_model(self, model_name, url, api_key):
+         self.model_name = model_name
+
+         self.openai_client = OpenAI(
+             base_url=url,
+             api_key=api_key,
+             timeout=httpx.Timeout(15.0)
+         )
+
+
+     def infer(self, diff_code, message=None, batch_size=1):
+         if self.model_name is None:
+             raise RuntimeError("Model is not loaded")
+
+         results = {}
+         input_list = []
+         file_diff_list = split_to_file_diff(diff_code, BaseHelper._get_lang_ext(OnlineLLMHelper.MODEL_CONFIGS[self.model_name]["supported_languages"]))
+         for file_a, _, file_diff in file_diff_list:
+             sections = split_to_section(file_diff)
+             file_name = file_a.removeprefix("a/")
+             results[file_name] = []
+             for section in sections:
+                 input_list.append(BaseHelper.InputData(file_name, section, section, message))
+
+         input_prompt, output_text, output_prob = self.do_infer(input_list, batch_size)
+         assert len(input_list) == len(input_prompt) == len(output_text) == len(output_prob)
+         for i in range(len(input_list)):
+             file_name = input_list[i].filename
+             section = input_list[i].section
+             output_text_i = output_text[i].lower()
+             output_prob_i = output_prob[i]
+             results[file_name].append({
+                 "section": section,
+                 "predict": -1 if output_text_i == "error" else 1 if "yes" in output_text_i else 0,
+                 "conf": output_prob_i
+             })
+
+         return results
+
+
+     def do_infer(self, input_list, batch_size=1):
+         input_prompt = []
+         for input_data in input_list:
+             input_prompt.append(OnlineLLMHelper.generate_message(input_data.filename, input_data.patch, input_data.section, input_data.patch_msg))
+
+         if len(input_prompt) > 0:
+             logger.info("Example input prompt: %s", input_prompt[0])
+         output_text = []
+         for prompt, input_data in tqdm(zip(input_prompt, input_list), desc="Inferencing", unit="section", total=len(input_prompt)):
+             try:
+                 response = self.openai_client.chat.completions.create(
+                     messages=prompt,
+                     model=self.model_name,
+                     max_completion_tokens=OnlineLLMHelper.MAX_NEW_TOKENS
+                 )
+                 output_text.append(response.choices[0].message.content)
+             except KeyboardInterrupt:
+                 logger.error("KeyboardInterrupt, stopping inference")
+                 break
+             except Exception as e:
+                 logger.error(f"Error: {e}")
+                 logger.error(f"Error inferencing: {input_data.filename} - {input_data.section}")
+                 logger.error(traceback.format_exc())
+
+                 output_text.append("error")
+                 continue
+
+         # the API returns no token probabilities here, so confidence defaults to 1.0
+         output_prob = [1.0] * len(output_text)
+         return input_prompt, output_text, output_prob
+
+
+ online_llm_helper = OnlineLLMHelper()
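
The online variant is driven the same way; a sketch with placeholder endpoint and key (any OpenAI-compatible API; the URL and key below are invented, and `diff_text` is as in the local sketch above):

from backend.section_infer_helper.online_llm_helper import online_llm_helper

online_llm_helper.load_model(
    "gpt-3.5-turbo",
    "https://api.example.com/v1",  # placeholder base URL
    "sk-...",                      # placeholder API key
)
results = online_llm_helper.infer(diff_text)
# sections whose API call failed carry "predict": -1 instead of raising
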
backend/section_infer_helper/random_helper.py ADDED
@@ -0,0 +1,39 @@
+ # DEBUG ONLY
+ import time
+ import random
+ from tqdm import tqdm
+
+ from backend.section_infer_helper.base_helper import BaseHelper
+ from backend.utils.data_process import split_to_file_diff, split_to_section
+
+ class RandomHelper(BaseHelper):
+
+     PREDEF_MODEL = ["Random"]
+
+     MODELS_SUPPORTED_LANGUAGES = {
+         "Random": ["C", "C++", "Java", "Python"]
+     }
+
+
+     def load_model(self, model_name):
+         pass
+
+
+     def infer(self, diff_code, message=None, batch_size=1):
+         file_diff_list = split_to_file_diff(diff_code, BaseHelper._get_lang_ext(self.MODELS_SUPPORTED_LANGUAGES["Random"]))
+         results = {}
+         for file_a, _, file_diff in tqdm(file_diff_list, desc="Inferencing", unit="file", total=len(file_diff_list)):
+             time.sleep(0.1)  # simulate inference latency
+             sections = split_to_section(file_diff)
+             file = file_a.removeprefix("a/")
+             results[file] = []
+             for section in sections:
+                 results[file].append({
+                     "section": section,
+                     "predict": random.choice([0, 1]),
+                     "conf": random.random()
+                 })
+         return results
+
+
+ random_helper = RandomHelper()
backend/utils/data_process.py ADDED
@@ -0,0 +1,130 @@
+ import os, re
+
+
+ LANGUAGE_EXT_MAP = {
+     "C": ['c', 'h', 'cc'],
+     "C++": ['cpp', 'hpp', 'cxx', 'hxx', 'c++', 'h++'],
+     "Java": ['java'],
+     "Python": ['py', 'pyx']
+ }
+
+
+ # Split a commit's diff into per-file changes
+ def split_to_file_diff(diff_code : str, preserve_ext_list : list = []) -> list:
+     # Regex that matches "diff --git" lines and captures the two file paths
+     pattern = re.compile(r"diff --git (\S+) (\S+)")  # matches the diff --git line
+     files = []
+     current_diff_content = []
+     current_file_a, current_file_b = None, None
+     preserve_ext_list = [f".{ext}" for ext in preserve_ext_list]
+
+     # Walk the diff line by line
+     for line in diff_code.splitlines():
+         match = pattern.match(line)
+         if match:
+             # A new "diff --git" line: flush the previous file's diff first
+             if current_file_a and current_file_b:
+                 # Get the file extensions
+                 ext_a = os.path.splitext(current_file_a)[1]
+                 ext_b = os.path.splitext(current_file_b)[1]
+                 # Keep only files with the requested extensions
+                 if len(preserve_ext_list) == 0 or (ext_a in preserve_ext_list and ext_b in preserve_ext_list):
+                     files.append((current_file_a, current_file_b, '\n'.join(current_diff_content)))
+
+             # Update the current file paths
+             current_file_a = match.group(1)
+             current_file_b = match.group(2)
+             current_diff_content = [line]  # reset the diff buffer, starting with this line
+         else:
+             if current_file_a and current_file_b:
+                 current_diff_content.append(line)
+
+     # Flush the last file's diff
+     if current_file_a and current_file_b:
+         ext_a = os.path.splitext(current_file_a)[1]
+         ext_b = os.path.splitext(current_file_b)[1]
+         if len(preserve_ext_list) == 0 or (ext_a in preserve_ext_list and ext_b in preserve_ext_list):
+             files.append((current_file_a, current_file_b, '\n'.join(current_diff_content)))
+
+     return files
+
+
+
+ # Split one file's diff into its individual change hunks
+ def split_to_section(file_diff : str) -> list:
+     # Match each @@ hunk header together with the change content that follows
+     # Pattern: starts at @@, runs until the next @@ header or end of input
+     pattern = re.compile(r"@@.*?@@(\r?\n?)([\s\S]*?)(?=@@|\Z)", re.MULTILINE)
+     change_blocks = []
+
+     # Collect every hunk
+     for match in pattern.finditer(file_diff):
+         # Take the whole hunk, header included
+         block = match.group(0)
+         # Append it as one change block
+         change_blocks.append(block)
+
+     return change_blocks
+
+
+
+ if __name__ == "__main__":
+     # Test case
+     diff = \
+ """diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c
+ index 71ba18efa15b..867664918715 100644
+ --- a/drivers/net/bonding/bond_main.c
+ +++ b/drivers/net/bonding/bond_main.c
+ @@ -1543,9 +1543,11 @@ int bond_enslave(struct net_device *bond_dev, struct net_device *slave_dev)
+ bond_set_carrier(bond);
+
+ if (USES_PRIMARY(bond->params.mode)) {
+ + block_netpoll_tx();
+ write_lock_bh(&bond->curr_slave_lock);
+ bond_select_active_slave(bond);
+ write_unlock_bh(&bond->curr_slave_lock);
+ + unblock_netpoll_tx();
+ }
+
+ pr_info("%s: enslaving %s as a%s interface with a%s link.\n",
+ @@ -1571,10 +1573,12 @@ err_detach:
+ if (bond->primary_slave == new_slave)
+ bond->primary_slave = NULL;
+ if (bond->curr_active_slave == new_slave) {
+ + block_netpoll_tx();
+ write_lock_bh(&bond->curr_slave_lock);
+ bond_change_active_slave(bond, NULL);
+ bond_select_active_slave(bond);
+ write_unlock_bh(&bond->curr_slave_lock);
+ + unblock_netpoll_tx();
+ }
+ slave_disable_netpoll(new_slave);
+
+ @@ -2864,9 +2868,12 @@ static int bond_slave_netdev_event(unsigned long event,
+ pr_info("%s: Primary slave changed to %s, reselecting active slave.\n",
+ bond->dev->name, bond->primary_slave ? slave_dev->name :
+ "none");
+ +
+ + block_netpoll_tx();
+ write_lock_bh(&bond->curr_slave_lock);
+ bond_select_active_slave(bond);
+ write_unlock_bh(&bond->curr_slave_lock);
+ + unblock_netpoll_tx();
+ break;
+ case NETDEV_FEAT_CHANGE:
+ bond_compute_features(bond);
+ """
+
+     # Extract all per-file diffs
+     changes = split_to_file_diff(diff, ['c'])
+     for file_a, file_b, diff_content in changes:
+         print(f"a: {file_a}, b: {file_b}")
+         print(diff_content)
+         print("=" * 50)
+
+     change_blocks = split_to_section(changes[0][2])
+     for idx, block in enumerate(change_blocks):
+         print(f"Change Block {idx + 1}:")
+         print(block)
+         print("-" * 50)
+
dataset_eval.py ADDED
@@ -0,0 +1,122 @@
+ import os
+ import jsonlines
+ import argparse
+ from tqdm import tqdm
+ import logging
+ from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, matthews_corrcoef
+
+ from backend.section_infer_helper.local_llm_helper import local_llm_helper
+ from backend.section_infer_helper.online_llm_helper import online_llm_helper
+
+
+ INCLUDE_MSG = "no"
+ BATCH_SIZE = 4
+
+ # may be overridden by environment variables
+ INCLUDE_MSG = os.environ.get("INCLUDE_MSG", INCLUDE_MSG)
+
+
+ logging.basicConfig(level=logging.INFO,
+                     format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
+
+
+ def main(args):
+     if args.type == "local":
+         helper = local_llm_helper
+         helper.load_model(args.model, args.peft)
+     elif args.type == "online":
+         helper = online_llm_helper
+         helper.load_model(args.model, args.url, args.key)
+
+     labels = []
+     predicts = []
+     input_prompts = []
+     output_text = []
+     output_probs = []
+
+     inputs = []
+     with jsonlines.open(args.data, "r") as reader:
+         test_data = list(reader)
+
+     finished_item = []
+     if os.path.exists(args.output):
+         with jsonlines.open(args.output, "r") as reader:
+             for i, item in enumerate(reader):
+                 finished_item.append((item["commit_id"], item["file_name"]))
+                 test_data[i] = item
+                 for section in item["sections"]:
+                     labels.append(section["related"])
+                     predicts.append(section["predict"])
+                     input_prompts.append(section["input_prompt"])
+                     output_text.append(section["output_text"])
+                     output_probs.append(section["conf"])
+
+
+     for item in test_data:
+         file_name = item["file_name"]
+         patch = item["patch"]
+         if (item["commit_id"], item["file_name"]) in finished_item:
+             print(f"Skip {item['commit_id']}, {item['file_name']}")
+             continue
+         commit_message = item["commit_message"] if INCLUDE_MSG == "yes" else ""
+         for section in item["sections"]:
+             section_content = section["section"]
+             inputs.append(helper.InputData(file_name, patch, section_content, commit_message))
+             labels.append(section["related"])
+
+     assert len(labels) == 4088, f"Get {len(labels)} labels"  # expected number of sections in the test set
+
+     this_input_prompts, this_output_text, this_output_probs = [], [], []  # fall back to empty results if inference fails
+     try:
+         this_input_prompts, this_output_text, this_output_probs = helper.do_infer(inputs, BATCH_SIZE)
+     except Exception as e:
+         print(f"Error: {e}")
+
+     input_prompts.extend(this_input_prompts)
+     output_text.extend(this_output_text)
+     output_probs.extend(this_output_probs)
+
+     for result in this_output_text:
+         predicts.append("yes" in result.lower())
+
+     # accuracy = accuracy_score(labels, predicts)
+     # precision = precision_score(labels, predicts)
+     # recall = recall_score(labels, predicts)
+     # f1 = f1_score(labels, predicts)
+     # mcc = matthews_corrcoef(labels, predicts)
+     # tn, fp, fn, tp = confusion_matrix(labels, predicts).ravel()
+     # fpr = fp / (fp + tn + 1e-8)
+     # print("=" * 20)
+     # print(f"Accuracy: {accuracy}")
+     # print(f"Precision: {precision}")
+     # print(f"Recall: {recall}")
+     # print(f"F1: {f1}")
+     # print(f"MCC: {mcc}")
+     # print(f"FPR: {fpr}")
+     # print("=" * 20)
+
+     with jsonlines.open(args.output, "w") as writer:
+         for item in test_data:
+             if len(output_text) < len(item["sections"]):
+                 logging.info("Not enough output")
+                 break
+             for section in item["sections"]:
+                 section["input_prompt"] = input_prompts.pop(0)
+                 section["output_text"] = output_text.pop(0)
+                 section["predict"] = bool(predicts.pop(0))
+                 section["conf"] = output_probs.pop(0)
+             writer.write(item)
+
+
+ if __name__ == "__main__":
+     parser = argparse.ArgumentParser()
+     parser.add_argument("-d", "--data", type=str, required=True, help="Path to the data file")
+     parser.add_argument("-t", "--type", type=str, required=True, help="Type of the model", choices=["local", "online"])
+     parser.add_argument("-m", "--model", type=str, required=True)
+     parser.add_argument("-p", "--peft", type=str, help="Path to the PEFT file")
+     parser.add_argument("-u", "--url", type=str, help="URL of the model")
+     parser.add_argument("-k", "--key", type=str, help="API key")
+     parser.add_argument("-o", "--output", type=str, required=True, help="Path to the output file")
+     args = parser.parse_args()
+     main(args)
+
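
For reference, a sketch of one input record as this script appears to expect it (one JSON object per line; field names are taken from the reads above, values are invented placeholders):

record = {
    "commit_id": "abc123",
    "file_name": "drivers/net/bonding/bond_main.c",
    "language": "C",                        # consumed by evaluate/statistic.py
    "commit_message": "bonding: fix race",  # used only when INCLUDE_MSG=yes
    "patch": "diff --git a/... b/...",
    "sections": [
        {"section": "@@ -1543,9 +1543,11 @@ ...", "related": True},
    ],
}
# after a run, each section additionally carries "input_prompt",
# "output_text", "predict" (bool) and "conf" (float)
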
evaluate/dataset/C_C++_Java_Python/test.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
evaluate/result.csv ADDED
@@ -0,0 +1,11 @@
+ ,All_Acc,All_F1,All_MCC,C/C++_Acc,C/C++_F1,C/C++_MCC,Java_Acc,Java_F1,Java_MCC,Python_Acc,Python_F1,Python_MCC
+ codellama-7b,84.31\%,6.14\%,-2.37\%,85.37\%,8.25\%,0.31\%,80.71\%,2.96\%,-7.70\%,84.84\%,3.49\%,-4.34\%
+ qwen-coder-plus,42.78\%,16.10\%,6.26\%,36.25\%,17.19\%,9.79\%,46.47\%,8.27\%,-14.31\%,60.97\%,21.08\%,22.63\%
+ qwen-7b,35.36\%,15.29\%,4.30\%,31.55\%,17.33\%,11.32\%,41.71\%,7.30\%,-18.04\%,41.08\%,15.37\%,14.05\%
+ qwen-0.5b,81.82\%,8.50\%,-1.08\%,82.40\%,9.62\%,0.30\%,79.00\%,4.29\%,-7.24\%,83.13\%,10.22\%,2.53\%
+ patchouli,94.58\%,56.61\%,55.71\%,93.03\%,44.27\%,42.29\%,98.29\%,89.82\%,89.25\%,95.61\%,40.74\%,49.44\%
+ patchouli_msg,94.77\%,57.87\%,57.22\%,93.22\%,44.81\%,43.33\%,98.35\%,90.28\%,89.64\%,95.88\%,46.43\%,53.82\%
+ patchouli_nomsg,94.40\%,55.36\%,54.22\%,92.83\%,43.75\%,41.33\%,98.24\%,89.36\%,88.86\%,95.34\%,34.62\%,44.66\%
+ dsc-7b,67.99\%,10.28\%,-2.72\%,67.32\%,10.38\%,-2.85\%,69.71\%,6.87\%,-8.29\%,68.31\%,13.48\%,6.04\%
+ deepseek-chat,36.46\%,16.30\%,7.38\%,37.96\%,17.10\%,9.21\%,35.35\%,19.25\%,9.78\%,32.58\%,9.90\%,-3.32\%
+ gpt-3.5-turbo,55.14\%,13.73\%,1.12\%,53.03\%,15.49\%,4.27\%,55.82\%,7.40\%,-12.42\%,61.59\%,14.11\%,7.54\%
evaluate/statistic.py ADDED
@@ -0,0 +1,127 @@
+ import os
+ import jsonlines
+ from collections import defaultdict
+ import pandas as pd
+ from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, matthews_corrcoef, confusion_matrix
+
+ RESULT_ROOTS = "./result"
+
+ LANGUAGE_MAP = {
+     "all": "All",
+     "C": "C/C++",
+     "C++": "C/C++",
+     "Java": "Java",
+     "Python": "Python",
+ }
+
+ table_dict = {}
+
+ for method in os.listdir(RESULT_ROOTS):
+
+     msg_labels = defaultdict(list)
+     msg_predicts = defaultdict(list)
+     msg_metrics = {}
+
+     nomsg_labels = defaultdict(list)
+     nomsg_predicts = defaultdict(list)
+     nomsg_metrics = {}
+
+     mix_labels = defaultdict(list)
+     mix_predicts = defaultdict(list)
+     mix_metrics = {}
+
+     msg_result_file = os.path.join(RESULT_ROOTS, method, "msg.jsonl")
+     nomsg_result_file = os.path.join(RESULT_ROOTS, method, "nomsg.jsonl")
+
+     if not os.path.exists(msg_result_file) or not os.path.exists(nomsg_result_file):
+         continue
+
+     with jsonlines.open(msg_result_file) as reader:
+         for item in reader:
+             lang = LANGUAGE_MAP[item["language"]]
+             for section in item["sections"]:
+                 msg_labels["All"].append(section['related'])
+                 msg_predicts["All"].append(section['predict'])
+                 msg_labels[lang].append(section['related'])
+                 msg_predicts[lang].append(section['predict'])
+                 mix_labels["All"].append(section['related'])
+                 mix_predicts["All"].append(section['predict'])
+                 mix_labels[lang].append(section['related'])
+                 mix_predicts[lang].append(section['predict'])
+
+
+     with jsonlines.open(nomsg_result_file) as reader:
+         for item in reader:
+             lang = LANGUAGE_MAP[item["language"]]
+             for section in item["sections"]:
+                 nomsg_labels["All"].append(section['related'])
+                 nomsg_predicts["All"].append(section['predict'])
+                 nomsg_labels[lang].append(section['related'])
+                 nomsg_predicts[lang].append(section['predict'])
+                 mix_labels["All"].append(section['related'])
+                 mix_predicts["All"].append(section['predict'])
+                 mix_labels[lang].append(section['related'])
+                 mix_predicts[lang].append(section['predict'])
+
+
+     for lang in dict.fromkeys(LANGUAGE_MAP.values()):  # dedupe while keeping order
+         accuracy = accuracy_score(msg_labels[lang], msg_predicts[lang])
+         # precision = precision_score(msg_labels[lang], msg_predicts[lang])
+         # recall = recall_score(msg_labels[lang], msg_predicts[lang])
+         f1 = f1_score(msg_labels[lang], msg_predicts[lang])
+         mcc = matthews_corrcoef(msg_labels[lang], msg_predicts[lang])
+         tn, fp, fn, tp = confusion_matrix(msg_labels[lang], msg_predicts[lang]).ravel()  # sklearn ravel order: tn, fp, fn, tp
+         fpr = fp / (fp + tn + 1e-6)
+
+         msg_metrics.update({
+             f"{lang}_Acc": f"{accuracy * 100:.2f}\\%",  # percent signs escaped for LaTeX tables
+             # f"{lang}_P": f"{precision * 100:.2f}%",
+             # f"{lang}_R": f"{recall * 100:.2f}%",
+             f"{lang}_F1": f"{f1 * 100:.2f}\\%",
+             # f"{lang}_FPR": f"{fpr * 100:.2f}\\%",
+             f"{lang}_MCC": f"{mcc * 100:.2f}\\%"
+         })
+
+         accuracy = accuracy_score(nomsg_labels[lang], nomsg_predicts[lang])
+         # precision = precision_score(nomsg_labels[lang], nomsg_predicts[lang])
+         # recall = recall_score(nomsg_labels[lang], nomsg_predicts[lang])
+         f1 = f1_score(nomsg_labels[lang], nomsg_predicts[lang])
+         mcc = matthews_corrcoef(nomsg_labels[lang], nomsg_predicts[lang])
+         tn, fp, fn, tp = confusion_matrix(nomsg_labels[lang], nomsg_predicts[lang]).ravel()  # sklearn ravel order: tn, fp, fn, tp
+         fpr = fp / (fp + tn + 1e-6)
+
+         nomsg_metrics.update({
+             f"{lang}_Acc": f"{accuracy * 100:.2f}\\%",
+             # f"{lang}_P": f"{precision * 100:.2f}%",
+             # f"{lang}_R": f"{recall * 100:.2f}%",
+             f"{lang}_F1": f"{f1 * 100:.2f}\\%",
+             # f"{lang}_FPR": f"{fpr * 100:.2f}\\%",
+             f"{lang}_MCC": f"{mcc * 100:.2f}\\%"
+         })
+
+         accuracy = accuracy_score(mix_labels[lang], mix_predicts[lang])
+         # precision = precision_score(mix_labels[lang], mix_predicts[lang])
+         # recall = recall_score(mix_labels[lang], mix_predicts[lang])
+         f1 = f1_score(mix_labels[lang], mix_predicts[lang])
+         mcc = matthews_corrcoef(mix_labels[lang], mix_predicts[lang])
+         tn, fp, fn, tp = confusion_matrix(mix_labels[lang], mix_predicts[lang]).ravel()  # sklearn ravel order: tn, fp, fn, tp
+         fpr = fp / (fp + tn + 1e-6)
+
+         mix_metrics.update({
+             f"{lang}_Acc": f"{accuracy * 100:.2f}\\%",
+             # f"{lang}_P": f"{precision * 100:.2f}%",
+             # f"{lang}_R": f"{recall * 100:.2f}%",
+             f"{lang}_F1": f"{f1 * 100:.2f}\\%",
+             # f"{lang}_FPR": f"{fpr * 100:.2f}\\%",
+             f"{lang}_MCC": f"{mcc * 100:.2f}\\%"
+         })
+
+     table_dict[method] = mix_metrics
+     if method == "patchouli":
+         table_dict[f"{method}_msg"] = msg_metrics
+         table_dict[f"{method}_nomsg"] = nomsg_metrics
+
+
+ df = pd.DataFrame(table_dict).T
+ df.to_csv("result.csv")
+
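
The unpack order in the metric blocks above matters: sklearn flattens the 2x2 confusion matrix row-major, so `ravel()` yields `tn, fp, fn, tp`; unpacking in any other order silently swaps the counts fed into the FPR formula. A quick check (not part of the commit):

from sklearn.metrics import confusion_matrix

# y_true has three negatives and one positive; the prediction gets
# one negative right, flags two negatives, and finds the positive:
# tn=1, fp=2, fn=0, tp=1
tn, fp, fn, tp = confusion_matrix([0, 0, 0, 1], [0, 1, 1, 1]).ravel()
assert (tn, fp, fn, tp) == (1, 2, 0, 1)  # row-major: tn, fp, fn, tp
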
evaluate_local.sh ADDED
@@ -0,0 +1,54 @@
+ export HF_HUB_OFFLINE=1
+ export DATASET_ROOT="./evaluate/dataset/C_C++_Java_Python"
+ export RESULT_ROOT="./evaluate/result"
+ export EVAL_SCRIPT="./dataset_eval.py"
+
+ export MODEL="Qwen/Qwen2.5-Coder-0.5B-Instruct"
+ export PEFT="./backend/model/PEFT/patchouli-qwc2.5-0.5b"
+ export OUTPUT_DIR="$RESULT_ROOT/patchouli"
+
+ mkdir -p $OUTPUT_DIR
+ export INCLUDE_MSG="no"
+ python $EVAL_SCRIPT -d $DATASET_ROOT/test.jsonl -t local -m $MODEL -p $PEFT -o "$OUTPUT_DIR/nomsg.jsonl" 2>&1 | tee $OUTPUT_DIR/nomsg.log
+ export INCLUDE_MSG="yes"
+ python $EVAL_SCRIPT -d $DATASET_ROOT/test.jsonl -t local -m $MODEL -p $PEFT -o "$OUTPUT_DIR/msg.jsonl" 2>&1 | tee $OUTPUT_DIR/msg.log
+
+
+ export MODEL="Qwen/Qwen2.5-Coder-0.5B-Instruct"
+ export OUTPUT_DIR="$RESULT_ROOT/qwen-0.5b"
+
+ mkdir -p $OUTPUT_DIR
+ export INCLUDE_MSG="no"
+ python $EVAL_SCRIPT -d $DATASET_ROOT/test.jsonl -t local -m $MODEL -o "$OUTPUT_DIR/nomsg.jsonl" 2>&1 | tee $OUTPUT_DIR/nomsg.log
+ export INCLUDE_MSG="yes"
+ python $EVAL_SCRIPT -d $DATASET_ROOT/test.jsonl -t local -m $MODEL -o "$OUTPUT_DIR/msg.jsonl" 2>&1 | tee $OUTPUT_DIR/msg.log
+
+
+ export MODEL="Qwen/Qwen2.5-Coder-7B-Instruct"
+ export OUTPUT_DIR="$RESULT_ROOT/qwen-7b"
+
+ mkdir -p $OUTPUT_DIR
+ export INCLUDE_MSG="no"
+ python $EVAL_SCRIPT -d $DATASET_ROOT/test.jsonl -t local -m $MODEL -o "$OUTPUT_DIR/nomsg.jsonl" 2>&1 | tee $OUTPUT_DIR/nomsg.log
+ export INCLUDE_MSG="yes"
+ python $EVAL_SCRIPT -d $DATASET_ROOT/test.jsonl -t local -m $MODEL -o "$OUTPUT_DIR/msg.jsonl" 2>&1 | tee $OUTPUT_DIR/msg.log
+
+
+ export MODEL="deepseek-ai/deepseek-coder-7b-instruct-v1.5"
+ export OUTPUT_DIR="$RESULT_ROOT/dsc-7b"
+
+ mkdir -p $OUTPUT_DIR
+ export INCLUDE_MSG="no"
+ python $EVAL_SCRIPT -d $DATASET_ROOT/test.jsonl -t local -m $MODEL -o "$OUTPUT_DIR/nomsg.jsonl" 2>&1 | tee $OUTPUT_DIR/nomsg.log
+ export INCLUDE_MSG="yes"
+ python $EVAL_SCRIPT -d $DATASET_ROOT/test.jsonl -t local -m $MODEL -o "$OUTPUT_DIR/msg.jsonl" 2>&1 | tee $OUTPUT_DIR/msg.log
+
+
+ export MODEL="codellama/CodeLlama-7b-Instruct-hf"
+ export OUTPUT_DIR="$RESULT_ROOT/codellama-7b"
+
+ mkdir -p $OUTPUT_DIR
+ export INCLUDE_MSG="no"
+ python $EVAL_SCRIPT -d $DATASET_ROOT/test.jsonl -t local -m $MODEL -o "$OUTPUT_DIR/nomsg.jsonl" 2>&1 | tee $OUTPUT_DIR/nomsg.log
+ export INCLUDE_MSG="yes"
+ python $EVAL_SCRIPT -d $DATASET_ROOT/test.jsonl -t local -m $MODEL -o "$OUTPUT_DIR/msg.jsonl" 2>&1 | tee $OUTPUT_DIR/msg.log
evaluate_online.sh ADDED
@@ -0,0 +1,37 @@
+ export HF_HUB_OFFLINE=1
+ export DATASET_ROOT="./evaluate/dataset/C_C++_Java_Python"
+ export RESULT_ROOT="./evaluate/result"
+ export EVAL_SCRIPT="./dataset_eval.py"
+
+ # GPT-3.5-turbo
+ export KEY=""
+ export URL="https://api.chatanywhere.tech/v1/"
+ export MODEL="gpt-3.5-turbo"
+
+ mkdir -p $RESULT_ROOT/$MODEL
+ export INCLUDE_MSG=no
+ python $EVAL_SCRIPT -d $DATASET_ROOT/test.jsonl -t online -m $MODEL -u $URL -k $KEY -o "$RESULT_ROOT/$MODEL/nomsg.jsonl" 2>&1 | tee $RESULT_ROOT/$MODEL/nomsg.log
+ export INCLUDE_MSG=yes
+ python $EVAL_SCRIPT -d $DATASET_ROOT/test.jsonl -t online -m $MODEL -u $URL -k $KEY -o "$RESULT_ROOT/$MODEL/msg.jsonl" 2>&1 | tee $RESULT_ROOT/$MODEL/msg.log
+
+ # DeepSeek-V3
+ export KEY=""
+ export URL="https://api.deepseek.com/v1"
+ export MODEL="deepseek-chat"
+
+ mkdir -p $RESULT_ROOT/$MODEL
+ export INCLUDE_MSG=no
+ python $EVAL_SCRIPT -d $DATASET_ROOT/test.jsonl -t online -m $MODEL -u $URL -k $KEY -o "$RESULT_ROOT/$MODEL/nomsg.jsonl" 2>&1 | tee $RESULT_ROOT/$MODEL/nomsg.log
+ export INCLUDE_MSG=yes
+ python $EVAL_SCRIPT -d $DATASET_ROOT/test.jsonl -t online -m $MODEL -u $URL -k $KEY -o "$RESULT_ROOT/$MODEL/msg.jsonl" 2>&1 | tee $RESULT_ROOT/$MODEL/msg.log
+
+ # qwen-coder-plus
+ export KEY=""
+ export URL="https://dashscope.aliyuncs.com/compatible-mode/v1"
+ export MODEL="qwen-coder-plus"
+
+ mkdir -p $RESULT_ROOT/$MODEL
+ export INCLUDE_MSG=no
+ python $EVAL_SCRIPT -d $DATASET_ROOT/test.jsonl -t online -m $MODEL -u $URL -k $KEY -o "$RESULT_ROOT/$MODEL/nomsg.jsonl" 2>&1 | tee $RESULT_ROOT/$MODEL/nomsg.log
+ export INCLUDE_MSG=yes
+ python $EVAL_SCRIPT -d $DATASET_ROOT/test.jsonl -t online -m $MODEL -u $URL -k $KEY -o "$RESULT_ROOT/$MODEL/msg.jsonl" 2>&1 | tee $RESULT_ROOT/$MODEL/msg.log
requirements.txt ADDED
@@ -0,0 +1,9 @@
+ gradio==5.12.0
+ jsonlines==4.0.0
+ openai==1.59.6
+ peft==0.12.0
+ scikit_learn==1.6.1
+ torch==2.5.1
+ tqdm==4.67.1
+ transformers==4.46.1
+ tensorboard