Spaces:
Running
Running
traveler514
committed on
Commit
·
81a794d
0
Parent(s):
first commit
Browse files- .gitattributes +2 -0
- .gitignore +174 -0
- README.md +9 -0
- app.py +164 -0
- backend/cwe_infer_helper.py +109 -0
- backend/examples.json +14 -0
- backend/inference.py +32 -0
- backend/model/PEFT/patchouli-qwc2.5-0.5b/adapter_config.json +34 -0
- backend/model/PEFT/patchouli-qwc2.5-0.5b/adapter_model.safetensors +3 -0
- backend/model/PEFT/patchouli-qwc2.5-0.5b/added_tokens.json +24 -0
- backend/model/PEFT/patchouli-qwc2.5-0.5b/merges.txt +0 -0
- backend/model/PEFT/patchouli-qwc2.5-0.5b/special_tokens_map.json +31 -0
- backend/model/PEFT/patchouli-qwc2.5-0.5b/tokenizer.json +3 -0
- backend/model/PEFT/patchouli-qwc2.5-0.5b/tokenizer_config.json +208 -0
- backend/model/PEFT/patchouli-qwc2.5-0.5b/vocab.json +0 -0
- backend/model/cwe-cls/patchouli-unixcoder/config.json +57 -0
- backend/model/cwe-cls/patchouli-unixcoder/model.safetensors +3 -0
- backend/section_infer_helper/base_helper.py +51 -0
- backend/section_infer_helper/local_llm_helper.py +197 -0
- backend/section_infer_helper/online_llm_helper.py +137 -0
- backend/section_infer_helper/random_helper.py +39 -0
- backend/utils/data_process.py +131 -0
- dataset_eval.py +121 -0
- evaluate/dataset/C_C++_Java_Python/test.jsonl +0 -0
- evaluate/result.csv +11 -0
- evaluate/statistic.py +127 -0
- evaluate_local.sh +54 -0
- evaluate_online.sh +37 -0
- requirements.txt +9 -0
.gitattributes
ADDED
@@ -0,0 +1,2 @@
|
|
|
|
|
|
|
1 |
+
*.safetensors filter=lfs diff=lfs merge=lfs -text
|
2 |
+
**/tokenizer.json filter=lfs diff=lfs merge=lfs -text
|
.gitignore
ADDED
@@ -0,0 +1,174 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Byte-compiled / optimized / DLL files
|
2 |
+
__pycache__/
|
3 |
+
*.py[cod]
|
4 |
+
*$py.class
|
5 |
+
|
6 |
+
# C extensions
|
7 |
+
*.so
|
8 |
+
|
9 |
+
# Distribution / packaging
|
10 |
+
.Python
|
11 |
+
build/
|
12 |
+
develop-eggs/
|
13 |
+
dist/
|
14 |
+
downloads/
|
15 |
+
eggs/
|
16 |
+
.eggs/
|
17 |
+
lib/
|
18 |
+
lib64/
|
19 |
+
parts/
|
20 |
+
sdist/
|
21 |
+
var/
|
22 |
+
wheels/
|
23 |
+
share/python-wheels/
|
24 |
+
*.egg-info/
|
25 |
+
.installed.cfg
|
26 |
+
*.egg
|
27 |
+
MANIFEST
|
28 |
+
|
29 |
+
# PyInstaller
|
30 |
+
# Usually these files are written by a python script from a template
|
31 |
+
# before PyInstaller builds the exe, so as to inject date/other infos into it.
|
32 |
+
*.manifest
|
33 |
+
*.spec
|
34 |
+
|
35 |
+
# Installer logs
|
36 |
+
pip-log.txt
|
37 |
+
pip-delete-this-directory.txt
|
38 |
+
|
39 |
+
# Unit test / coverage reports
|
40 |
+
htmlcov/
|
41 |
+
.tox/
|
42 |
+
.nox/
|
43 |
+
.coverage
|
44 |
+
.coverage.*
|
45 |
+
.cache
|
46 |
+
nosetests.xml
|
47 |
+
coverage.xml
|
48 |
+
*.cover
|
49 |
+
*.py,cover
|
50 |
+
.hypothesis/
|
51 |
+
.pytest_cache/
|
52 |
+
cover/
|
53 |
+
|
54 |
+
# Translations
|
55 |
+
*.mo
|
56 |
+
*.pot
|
57 |
+
|
58 |
+
# Django stuff:
|
59 |
+
*.log
|
60 |
+
local_settings.py
|
61 |
+
db.sqlite3
|
62 |
+
db.sqlite3-journal
|
63 |
+
|
64 |
+
# Flask stuff:
|
65 |
+
instance/
|
66 |
+
.webassets-cache
|
67 |
+
|
68 |
+
# Scrapy stuff:
|
69 |
+
.scrapy
|
70 |
+
|
71 |
+
# Sphinx documentation
|
72 |
+
docs/_build/
|
73 |
+
|
74 |
+
# PyBuilder
|
75 |
+
.pybuilder/
|
76 |
+
target/
|
77 |
+
|
78 |
+
# Jupyter Notebook
|
79 |
+
.ipynb_checkpoints
|
80 |
+
|
81 |
+
# IPython
|
82 |
+
profile_default/
|
83 |
+
ipython_config.py
|
84 |
+
|
85 |
+
# pyenv
|
86 |
+
# For a library or package, you might want to ignore these files since the code is
|
87 |
+
# intended to run in multiple environments; otherwise, check them in:
|
88 |
+
# .python-version
|
89 |
+
|
90 |
+
# pipenv
|
91 |
+
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
|
92 |
+
# However, in case of collaboration, if having platform-specific dependencies or dependencies
|
93 |
+
# having no cross-platform support, pipenv may install dependencies that don't work, or not
|
94 |
+
# install all needed dependencies.
|
95 |
+
#Pipfile.lock
|
96 |
+
|
97 |
+
# UV
|
98 |
+
# Similar to Pipfile.lock, it is generally recommended to include uv.lock in version control.
|
99 |
+
# This is especially recommended for binary packages to ensure reproducibility, and is more
|
100 |
+
# commonly ignored for libraries.
|
101 |
+
#uv.lock
|
102 |
+
|
103 |
+
# poetry
|
104 |
+
# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
|
105 |
+
# This is especially recommended for binary packages to ensure reproducibility, and is more
|
106 |
+
# commonly ignored for libraries.
|
107 |
+
# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
|
108 |
+
#poetry.lock
|
109 |
+
|
110 |
+
# pdm
|
111 |
+
# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
|
112 |
+
#pdm.lock
|
113 |
+
# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
|
114 |
+
# in version control.
|
115 |
+
# https://pdm.fming.dev/latest/usage/project/#working-with-version-control
|
116 |
+
.pdm.toml
|
117 |
+
.pdm-python
|
118 |
+
.pdm-build/
|
119 |
+
|
120 |
+
# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
|
121 |
+
__pypackages__/
|
122 |
+
|
123 |
+
# Celery stuff
|
124 |
+
celerybeat-schedule
|
125 |
+
celerybeat.pid
|
126 |
+
|
127 |
+
# SageMath parsed files
|
128 |
+
*.sage.py
|
129 |
+
|
130 |
+
# Environments
|
131 |
+
.env
|
132 |
+
.venv
|
133 |
+
env/
|
134 |
+
venv/
|
135 |
+
ENV/
|
136 |
+
env.bak/
|
137 |
+
venv.bak/
|
138 |
+
|
139 |
+
# Spyder project settings
|
140 |
+
.spyderproject
|
141 |
+
.spyproject
|
142 |
+
|
143 |
+
# Rope project settings
|
144 |
+
.ropeproject
|
145 |
+
|
146 |
+
# mkdocs documentation
|
147 |
+
/site
|
148 |
+
|
149 |
+
# mypy
|
150 |
+
.mypy_cache/
|
151 |
+
.dmypy.json
|
152 |
+
dmypy.json
|
153 |
+
|
154 |
+
# Pyre type checker
|
155 |
+
.pyre/
|
156 |
+
|
157 |
+
# pytype static type analyzer
|
158 |
+
.pytype/
|
159 |
+
|
160 |
+
# Cython debug symbols
|
161 |
+
cython_debug/
|
162 |
+
|
163 |
+
# PyCharm
|
164 |
+
# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
|
165 |
+
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
|
166 |
+
# and can be added to the global gitignore or merged into this file. For a more nuclear
|
167 |
+
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
|
168 |
+
#.idea/
|
169 |
+
|
170 |
+
# PyPI configuration file
|
171 |
+
.pypirc
|
172 |
+
|
173 |
+
.vscode/
|
174 |
+
evaluate/result/
|
README.md
ADDED
@@ -0,0 +1,9 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
---
|
2 |
+
title: PATCHOULI
|
3 |
+
pinned: true
|
4 |
+
sdk: gradio
|
5 |
+
---
|
6 |
+
|
7 |
+
# PATCHOULI
|
8 |
+
|
9 |
+
PATCHOULI (Patch Observing and Untangling Engine) is an easy-to-use software security patch analyzing tool.
|
app.py
ADDED
@@ -0,0 +1,164 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import os
|
2 |
+
import json
|
3 |
+
import logging
|
4 |
+
import gradio as gr
|
5 |
+
|
6 |
+
from backend.inference import section_infer, cwe_infer, PREDEF_MODEL_MAP, LOCAL_MODEL_PEFT_MAP, PREDEF_CWE_MODEL
|
7 |
+
|
8 |
+
APP_TITLE = "PATCHOULI"
|
9 |
+
|
10 |
+
STYLE_APP_TITLE = '<div style="text-align: center; font-weight: bold; font-family: Arial, sans-serif; font-size: 44px;">' + \
|
11 |
+
'<span style="color: #14e166">PATCH</span> ' + \
|
12 |
+
'<span style="color: #14e166">O</span>bserving ' + \
|
13 |
+
'and ' + \
|
14 |
+
'<span style="color: #14e166">U</span>ntang<span style="color: #14e166">l</span>ing ' + \
|
15 |
+
'Eng<span style="color: #14e166">i</span>ne' + \
|
16 |
+
'</div>'
|
17 |
+
|
18 |
+
# from 0.00 to 1.00, 41 colors
|
19 |
+
NONVUL_GRADIENT_COLORS = ["#d3f8d6",
|
20 |
+
"#d3f8d6", "#d0f8d3", "#ccf7d0", "#c9f7cd", "#c6f6cb", "#c2f6c8", "#bff5c5", "#bcf5c2", "#b8f4bf", "#b5f4bc",
|
21 |
+
"#b1f3ba", "#aef2b7", "#aaf2b4", "#a7f1b1", "#a3f1ae", "#9ff0ab", "#9cf0a9", "#98efa6", "#94efa3", "#90eea0",
|
22 |
+
"#8ced9d", "#88ed9a", "#84ec98", "#80ec95", "#7ceb92", "#78ea8f", "#73ea8c", "#6fe989", "#6ae886", "#65e883",
|
23 |
+
"#60e781", "#5ae67e", "#55e67b", "#4fe578", "#48e475", "#41e472", "#39e36f", "#30e26c", "#25e269", "#14e166"
|
24 |
+
]
|
25 |
+
|
26 |
+
# from 0.00 to 1.00, 41 colors
|
27 |
+
VUL_GRADIENT_COLORS = ["#d3f8d6",
|
28 |
+
"#fdcfc9", "#fdccc5", "#fcc9c2", "#fcc5bf", "#fcc2bb", "#fbbfb8", "#fbbcb4", "#fab9b1", "#fab5ad", "#f9b2aa",
|
29 |
+
"#f8afa7", "#f8aca3", "#f7a8a0", "#f7a59c", "#f6a299", "#f59f96", "#f59c92", "#f4988f", "#f3958c", "#f29288",
|
30 |
+
"#f18e85", "#f18b82", "#f0887f", "#ef847c", "#ee8178", "#ed7e75", "#ec7a72", "#eb776f", "#ea736c", "#e97068",
|
31 |
+
"#e86c65", "#e76962", "#e6655f", "#e5615c", "#e45e59", "#e35a56", "#e25653", "#e05250", "#df4e4d", "#de4a4a"
|
32 |
+
]
|
33 |
+
|
34 |
+
|
35 |
+
logging.basicConfig(level=logging.INFO,
|
36 |
+
format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
|
37 |
+
logging.getLogger("httpx").setLevel(logging.WARNING)
|
38 |
+
|
39 |
+
|
40 |
+
def generate_color_map():
    """Build the color map for gr.HighlightedText.

    One entry per confidence percentage (0.00 .. 1.00 in 0.01 steps) for each
    of the two label families, mapped onto the 41-step gradient palettes.
    """
    mapping = {}
    for pct in range(101):
        shade = int(pct * 0.4)  # squeeze 0..100 onto the 41 palette slots
        conf_text = f"{pct / 100:0.2f}"
        mapping[f"non-vul-fixing: {conf_text}"] = NONVUL_GRADIENT_COLORS[shade]
        mapping[f"vul-fixing: {conf_text}"] = VUL_GRADIENT_COLORS[shade]
    return mapping
|
46 |
+
|
47 |
+
|
48 |
+
def on_submit(diff_code, patch_message, cwe_model, section_model_type, progress = gr.Progress(track_tqdm=True), *model_config):
    """Run section-level inference on the pasted diff and (optionally) CWE
    classification; returns (patch label, per-file section results, CWE result).

    `model_config` is forwarded untouched to the selected backend helper.
    """
    if diff_code == "":
        # Nothing to analyze: leave all three outputs unchanged.
        return gr.skip(), gr.skip(), gr.skip()

    try:
        section_results = section_infer(diff_code, patch_message, section_model_type, *model_config)
    except Exception as e:
        raise gr.Error(f"Error: {str(e)}")

    # A patch counts as vul-fixing if any section of any file was predicted 1.
    vul_cnt = sum(
        1
        for file_results in section_results.values()
        for item in file_results
        if item["predict"] == 1
    )
    is_vul = vul_cnt > 0

    patch_category_label = gr.Label(
        value="Vul-fixing patch" if is_vul else "Non-vul-fixing patch",
        color="#de4a4a" if is_vul else "#14e166",
    )

    # CWE classification only makes sense for vul-fixing patches.
    if cwe_model == "":
        cwe_cls_result = "No model selected"
    elif not is_vul:
        cwe_cls_result = "No vulnerability found"
    else:
        cwe_cls_result = cwe_infer(diff_code, patch_message, cwe_model)

    return patch_category_label, section_results, cwe_cls_result
|
75 |
+
|
76 |
+
|
77 |
+
with gr.Blocks(title = APP_TITLE, fill_width=True) as demo:

    # Cross-component state: per-file section predictions and CWE result.
    section_results_state = gr.State({})
    cls_results_state = gr.State({})

    title = gr.HTML(STYLE_APP_TITLE)

    with gr.Row() as main_block:

        # Left column: patch input, model selection, examples.
        with gr.Column(scale=1) as input_block:
            diff_codebox = gr.Code(label="Input git diff here", max_lines=10)

            with gr.Accordion("Patch message (optional)", open=False):
                message_textbox = gr.Textbox(label="Patch message", placeholder="Enter patch message here", container=False, lines=2, max_lines=5)

            cwe_model_selector = gr.Dropdown(PREDEF_CWE_MODEL, label="Select vulnerability type classifier", allow_custom_value=True)

            # One tab per inference backend (local vs. online LLM).
            with gr.Tabs(selected=0) as model_type_tabs:
                MODEL_TYPE_NAMES = list(PREDEF_MODEL_MAP.keys())
                with gr.Tab(MODEL_TYPE_NAMES[0]) as local_llm_tab:
                    local_model_selector = gr.Dropdown(PREDEF_MODEL_MAP[MODEL_TYPE_NAMES[0]], label="Select model", allow_custom_value=True)
                    local_peft_selector = gr.Dropdown(LOCAL_MODEL_PEFT_MAP[local_model_selector.value], label="Select PEFT model (optional)", allow_custom_value=True)
                    local_submit_btn = gr.Button("Run", variant="primary")
                with gr.Tab(MODEL_TYPE_NAMES[1]) as online_llm_tab:
                    online_model_selector = gr.Dropdown(PREDEF_MODEL_MAP[MODEL_TYPE_NAMES[1]], label="Select model", allow_custom_value=True)
                    online_api_url_textbox = gr.Textbox(label="API URL")
                    online_api_key_textbox = gr.Textbox(label="API Key", placeholder="We won't store your API key", value=os.getenv("ONLINE_API_KEY"), type="password")
                    online_submit_btn = gr.Button("Run", variant="primary")

            # Tracks which backend tab is active (updated in the select handler).
            section_model_type = gr.State(model_type_tabs.children[0].label)

            with gr.Accordion("Load examples", open=False):
                with open("./backend/examples.json", "r") as f:
                    examples = json.load(f)
                gr.Button("Load example 1", size='sm').click(lambda : examples[0], outputs=[diff_codebox, message_textbox])
                gr.Button("Load example 2", size='sm').click(lambda : examples[1], outputs=[diff_codebox, message_textbox])
                gr.Button("Load example 3", size='sm').click(lambda : examples[2], outputs=[diff_codebox, message_textbox])

        # Middle column: per-file tabs with highlighted diff sections.
        with gr.Column(scale=2) as section_result_block:
            @gr.render(inputs=section_results_state, triggers=[section_results_state.change, demo.load])
            def display_result(section_results):
                """Re-render the result tabs whenever section results change."""
                if not section_results or len(section_results) == 0:
                    with gr.Tab("File tabs"):
                        gr.Markdown("No results")
                    return
                label_names = {-1: 'error', 0: 'non-vul-fixing', 1: 'vul-fixing'}
                full_color_map = generate_color_map()
                for file_name, file_results in section_results.items():
                    with gr.Tab(file_name) as file_tab:
                        highlighted_segments = []
                        # Only pass the colors actually used in this tab.
                        used_colors = {}
                        for item in file_results:
                            text_label = f"{label_names[item['predict']]}: {item['conf']:0.2f}"
                            used_colors[text_label] = full_color_map[text_label]
                            highlighted_segments.append((item["section"], text_label))
                        gr.HighlightedText(
                            highlighted_segments,
                            label="Results",
                            color_map=used_colors
                        )

        # Right column: whole-patch verdict and CWE classification.
        with gr.Column(scale=1) as result_block:
            patch_category_label = gr.Label(value = "No results", label = "Result of the whole patch")
            def update_vul_type_label(cls_results):
                return gr.Label(cls_results)
            vul_type_label = gr.Label(update_vul_type_label, label = "Possible fixed vulnerability type", inputs = [cls_results_state])


    def update_model_type_state(evt: gr.SelectData):
        # The selected tab's label doubles as the backend type name.
        return evt.value
    model_type_tabs.select(update_model_type_state, outputs = [section_model_type])

    def update_support_peft(base_model):
        # Restrict the PEFT choices to adapters compatible with the base model.
        return gr.Dropdown(LOCAL_MODEL_PEFT_MAP[base_model], value = LOCAL_MODEL_PEFT_MAP[base_model][0])
    local_model_selector.change(update_support_peft, inputs=[local_model_selector], outputs = [local_peft_selector])

    local_submit_btn.click(fn = on_submit,
                           inputs = [diff_codebox, message_textbox, cwe_model_selector, section_model_type, local_model_selector, local_peft_selector],
                           outputs = [patch_category_label, section_results_state, cls_results_state])
    online_submit_btn.click(fn = on_submit,
                            inputs = [diff_codebox, message_textbox, cwe_model_selector, section_model_type, online_model_selector, online_api_url_textbox, online_api_key_textbox],
                            outputs = [patch_category_label, section_results_state, cls_results_state])


demo.launch()
|
backend/cwe_infer_helper.py
ADDED
@@ -0,0 +1,109 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import logging
|
2 |
+
from collections import defaultdict
|
3 |
+
from transformers import pipeline, AutoTokenizer
|
4 |
+
|
5 |
+
logger = logging.getLogger(__name__)
|
6 |
+
|
7 |
+
class CweInferHelper():
    """Loads and runs a text-classification model that maps a patch
    (diff plus optional commit message) to its most likely CWE labels."""

    # Number of top-scoring CWE labels returned by infer().
    TOP_K = 5
    # Token budget used when truncating the classifier input.
    MAX_LENGTH = 1024

    class _ModelConfigMap(dict):
        """Model-name -> config mapping with an on-the-fly fallback.

        BUG FIX: the original used `defaultdict(lambda model: {...})`, but a
        defaultdict factory is called with *no* arguments, so looking up any
        unregistered model raised TypeError instead of producing the intended
        default config. __missing__ receives the key, which is what the
        original lambda clearly wanted.
        """
        def __missing__(self, model):
            # Deliberately not cached: keys()/PREDEF_MODEL stay limited to
            # the explicitly registered models.
            return {
                "model_name_or_path": model,
                "tokenizer_name": model
            }

    MODEL_CONFIG = _ModelConfigMap({
        "patchouli-cwe-UniXcoder": {
            "model_name_or_path": "./backend/model/cwe-cls/patchouli-unixcoder",
            "tokenizer_name": "microsoft/unixcoder-base-nine"
        }
    })

    # Names offered in the UI dropdown (only the explicitly registered ones).
    PREDEF_MODEL = list(MODEL_CONFIG.keys())

    def __init__(self):
        self.model = None       # name of the currently loaded model
        self.classifier = None  # transformers text-classification pipeline
        self.tokenizer = None   # tokenizer used for input truncation

    def load_model(self, model):
        """Load (or reuse) the classification pipeline for `model`.

        A no-op when `model` is already the loaded model.
        """
        logger.info(f"Loading CWE classify model: {model}")
        if model == self.model:
            return
        model_name_or_path = self.MODEL_CONFIG[model]["model_name_or_path"]
        tokenizer_name = self.MODEL_CONFIG[model]["tokenizer_name"]
        self.classifier = pipeline("text-classification", model=model_name_or_path, tokenizer=tokenizer_name, device_map="auto")
        self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)
        # Record the name only after a successful load; the original set it
        # first, so a failed load left the helper claiming the model was
        # loaded and later calls silently no-op'd on a broken classifier.
        self.model = model

    def infer(self, diff_code, patch_message = None):
        """Classify the patch and return {label: score} for the TOP_K labels.

        Raises:
            ValueError: if load_model() has not been called successfully.
        """
        if self.classifier is None:
            raise ValueError("Model is not loaded")
        input_text = ""
        if patch_message is not None and patch_message != "":
            input_text += f"[MESSAGE]\n{patch_message}\n"
        input_text += f"[PATCH]\n{diff_code}"
        logger.info(f"Classifying CWE for diff code")
        # Truncate via the tokenizer (leaving headroom for special tokens),
        # then decode back to text so the pipeline sees a bounded input.
        input_ids = self.tokenizer(input_text, max_length=CweInferHelper.MAX_LENGTH-10, padding="max_length", truncation=True).input_ids
        input_text = self.tokenizer.decode(input_ids)
        result = self.classifier(input_text, top_k = self.TOP_K)
        result = {item["label"]: item["score"] for item in result}
        return result
|
57 |
+
|
58 |
+
|
59 |
+
# Module-level singleton shared by backend.inference.
cwe_infer_helper = CweInferHelper()


if __name__ == "__main__":
    # Smoke test: classify a known vul-fixing Linux kernel patch (ipvs
    # ioctl bounds-checking fix).
    # NOTE(review): demo string reconstructed from a whitespace-mangled
    # source; exact indentation of the embedded diff may differ slightly.
    code = """diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c
index 6bde12da2fe003..c37ac2d7bec44d 100644
--- a/net/netfilter/ipvs/ip_vs_ctl.c
+++ b/net/netfilter/ipvs/ip_vs_ctl.c
@@ -2077,6 +2077,10 @@ do_ip_vs_set_ctl(struct sock *sk, int cmd, void __user *user, unsigned int len)
 	if (!capable(CAP_NET_ADMIN))
 		return -EPERM;
 
+	if (cmd < IP_VS_BASE_CTL || cmd > IP_VS_SO_SET_MAX)
+		return -EINVAL;
+	if (len < 0 || len > MAX_ARG_LEN)
+		return -EINVAL;
 	if (len != set_arglen[SET_CMDID(cmd)]) {
 		pr_err("set_ctl: len %u != %u\n",
 		       len, set_arglen[SET_CMDID(cmd)]);
@@ -2352,17 +2356,25 @@ do_ip_vs_get_ctl(struct sock *sk, int cmd, void __user *user, int *len)
 {
 	unsigned char arg[128];
 	int ret = 0;
+	unsigned int copylen;
 
 	if (!capable(CAP_NET_ADMIN))
 		return -EPERM;
 
+	if (cmd < IP_VS_BASE_CTL || cmd > IP_VS_SO_GET_MAX)
+		return -EINVAL;
+
 	if (*len < get_arglen[GET_CMDID(cmd)]) {
 		pr_err("get_ctl: len %u < %u\n",
 		       *len, get_arglen[GET_CMDID(cmd)]);
 		return -EINVAL;
 	}
 
-	if (copy_from_user(arg, user, get_arglen[GET_CMDID(cmd)]) != 0)
+	copylen = get_arglen[GET_CMDID(cmd)];
+	if (copylen > 128)
+		return -EINVAL;
+
+	if (copy_from_user(arg, user, copylen) != 0)
 		return -EFAULT;
 
 	if (mutex_lock_interruptible(&__ip_vs_mutex))
"""
    # BUG FIX: the original called load_model("patchouli"), which is not a
    # registered model name (see CweInferHelper.MODEL_CONFIG) and therefore
    # attempted to download a nonexistent Hub model "patchouli". Use the
    # registered local classifier instead.
    cwe_infer_helper.load_model("patchouli-cwe-UniXcoder")
    result = cwe_infer_helper.infer(code)
    print(result)
|
backend/examples.json
ADDED
@@ -0,0 +1,14 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
[
|
2 |
+
[
|
3 |
+
"diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c\nindex ec83f413a7ed19..88a565f130a5a2 100644\n--- a/net/wireless/nl80211.c\n+++ b/net/wireless/nl80211.c\n@@ -3406,12 +3406,12 @@ static int nl80211_trigger_scan(struct sk_buff *skb, struct genl_info *info)\n \ti = 0;\n \tif (info->attrs[NL80211_ATTR_SCAN_SSIDS]) {\n \t\tnla_for_each_nested(attr, info->attrs[NL80211_ATTR_SCAN_SSIDS], tmp) {\n+\t\t\trequest->ssids[i].ssid_len = nla_len(attr);\n \t\t\tif (request->ssids[i].ssid_len > IEEE80211_MAX_SSID_LEN) {\n \t\t\t\terr = -EINVAL;\n \t\t\t\tgoto out_free;\n \t\t\t}\n \t\t\tmemcpy(request->ssids[i].ssid, nla_data(attr), nla_len(attr));\n-\t\t\trequest->ssids[i].ssid_len = nla_len(attr);\n \t\t\ti++;\n \t\t}\n \t}\n@@ -3572,6 +3572,7 @@ static int nl80211_start_sched_scan(struct sk_buff *skb,\n \tif (info->attrs[NL80211_ATTR_SCAN_SSIDS]) {\n \t\tnla_for_each_nested(attr, info->attrs[NL80211_ATTR_SCAN_SSIDS],\n \t\t\t\t tmp) {\n+\t\t\trequest->ssids[i].ssid_len = nla_len(attr);\n \t\t\tif (request->ssids[i].ssid_len >\n \t\t\t IEEE80211_MAX_SSID_LEN) {\n \t\t\t\terr = -EINVAL;\n@@ -3579,7 +3580,6 @@ static int nl80211_start_sched_scan(struct sk_buff *skb,\n \t\t\t}\n \t\t\tmemcpy(request->ssids[i].ssid, nla_data(attr),\n \t\t\t nla_len(attr));\n-\t\t\trequest->ssids[i].ssid_len = nla_len(attr);\n \t\t\ti++;\n \t\t}\n \t}",
|
4 |
+
"nl80211: fix check for valid SSID size in scan operations\nIn both trigger_scan and sched_scan operations, we were checking for\nthe SSID length before assigning the value correctly. Since the\nmemory was just kzalloc'ed, the check was always failing and SSID with\nover 32 characters were allowed to go through.\n\nThis was causing a buffer overflow when copying the actual SSID to the\nproper place.\n\nThis bug has been there since 2.6.29-rc4.\n\nCc: [email protected]\nSigned-off-by: Luciano Coelho <[email protected]>\nSigned-off-by: John W. Linville <[email protected]>"
|
5 |
+
],
|
6 |
+
[
|
7 |
+
"diff --git a/src/include/utils/datetime.h b/src/include/utils/datetime.h\nindex 12f1e7753c..b10648269f 100644\n--- a/src/include/utils/datetime.h\n+++ b/src/include/utils/datetime.h\n@@ -188,12 +188,17 @@ struct tzEntry;\n #define DTK_DATE_M\t\t(DTK_M(YEAR) | DTK_M(MONTH) | DTK_M(DAY))\n #define DTK_TIME_M\t\t(DTK_M(HOUR) | DTK_M(MINUTE) | DTK_ALL_SECS_M)\n \n-#define MAXDATELEN\t\t63\t\t/* maximum possible length of an input date\n-\t\t\t\t\t\t\t\t * string (not counting tr. null) */\n-#define MAXDATEFIELDS\t25\t\t/* maximum possible number of fields in a date\n-\t\t\t\t\t\t\t\t * string */\n-#define TOKMAXLEN\t\t10\t\t/* only this many chars are stored in\n-\t\t\t\t\t\t\t\t * datetktbl */\n+/*\n+ * Working buffer size for input and output of interval, timestamp, etc.\n+ * Inputs that need more working space will be rejected early. Longer outputs\n+ * will overrun buffers, so this must suffice for all possible output. As of\n+ * this writing, interval_out() needs the most space at ~90 bytes.\n+ */\n+#define MAXDATELEN\t\t128\n+/* maximum possible number of fields in a date string */\n+#define MAXDATEFIELDS\t25\n+/* only this many chars are stored in datetktbl */\n+#define TOKMAXLEN\t\t10\n \n /* keep this struct small; it gets used a lot */\n typedef struct\ndiff --git a/src/interfaces/ecpg/pgtypeslib/datetime.c b/src/interfaces/ecpg/pgtypeslib/datetime.c\nindex 6600759220..a271cdd7d1 100644\n--- a/src/interfaces/ecpg/pgtypeslib/datetime.c\n+++ b/src/interfaces/ecpg/pgtypeslib/datetime.c\n@@ -60,14 +60,14 @@ PGTYPESdate_from_asc(char *str, char **endptr)\n \tint\t\t\tnf;\n \tchar\t *field[MAXDATEFIELDS];\n \tint\t\t\tftype[MAXDATEFIELDS];\n-\tchar\t\tlowstr[MAXDATELEN + 1];\n+\tchar\t\tlowstr[MAXDATELEN + MAXDATEFIELDS];\n \tchar\t *realptr;\n \tchar\t **ptr = (endptr != NULL) ? 
endptr : &realptr;\n \n \tbool\t\tEuroDates = FALSE;\n \n \terrno = 0;\n-\tif (strlen(str) >= sizeof(lowstr))\n+\tif (strlen(str) > MAXDATELEN)\n \t{\n \t\terrno = PGTYPES_DATE_BAD_DATE;\n \t\treturn INT_MIN;\ndiff --git a/src/interfaces/ecpg/pgtypeslib/dt.h b/src/interfaces/ecpg/pgtypeslib/dt.h\nindex d7a1935516..3a50d1410e 100644\n--- a/src/interfaces/ecpg/pgtypeslib/dt.h\n+++ b/src/interfaces/ecpg/pgtypeslib/dt.h\n@@ -192,12 +192,17 @@ typedef double fsec_t;\n #define DTK_DATE_M\t\t(DTK_M(YEAR) | DTK_M(MONTH) | DTK_M(DAY))\n #define DTK_TIME_M\t\t(DTK_M(HOUR) | DTK_M(MINUTE) | DTK_M(SECOND))\n \n-#define MAXDATELEN\t\t63\t\t/* maximum possible length of an input date\n-\t\t\t\t\t\t\t\t * string (not counting tr. null) */\n-#define MAXDATEFIELDS\t25\t\t/* maximum possible number of fields in a date\n-\t\t\t\t\t\t\t\t * string */\n-#define TOKMAXLEN\t\t10\t\t/* only this many chars are stored in\n-\t\t\t\t\t\t\t\t * datetktbl */\n+/*\n+ * Working buffer size for input and output of interval, timestamp, etc.\n+ * Inputs that need more working space will be rejected early. Longer outputs\n+ * will overrun buffers, so this must suffice for all possible output. 
As of\n+ * this writing, PGTYPESinterval_to_asc() needs the most space at ~90 bytes.\n+ */\n+#define MAXDATELEN\t\t128\n+/* maximum possible number of fields in a date string */\n+#define MAXDATEFIELDS\t25\n+/* only this many chars are stored in datetktbl */\n+#define TOKMAXLEN\t\t10\n \n /* keep this struct small; it gets used a lot */\n typedef struct\ndiff --git a/src/interfaces/ecpg/pgtypeslib/dt_common.c b/src/interfaces/ecpg/pgtypeslib/dt_common.c\nindex 112538ed50..c5d91ed922 100644\n--- a/src/interfaces/ecpg/pgtypeslib/dt_common.c\n+++ b/src/interfaces/ecpg/pgtypeslib/dt_common.c\n@@ -1171,15 +1171,22 @@ DecodeNumberField(int len, char *str, int fmask,\n \tif ((cp = strchr(str, '.')) != NULL)\n \t{\n #ifdef HAVE_INT64_TIMESTAMP\n-\t\tchar\t\tfstr[MAXDATELEN + 1];\n+\t\tchar\t\tfstr[7];\n+\t\tint\t\t\ti;\n+\n+\t\tcp++;\n \n \t\t/*\n \t\t * OK, we have at most six digits to care about. Let's construct a\n-\t\t * string and then do the conversion to an integer.\n+\t\t * string with those digits, zero-padded on the right, and then do\n+\t\t * the conversion to an integer.\n+\t\t *\n+\t\t * XXX This truncates the seventh digit, unlike rounding it as do\n+\t\t * the backend and the !HAVE_INT64_TIMESTAMP case.\n \t\t */\n-\t\tstrcpy(fstr, (cp + 1));\n-\t\tstrcpy(fstr + strlen(fstr), \"000000\");\n-\t\t*(fstr + 6) = '\\0';\n+\t\tfor (i = 0; i < 6; i++)\n+\t\t\tfstr[i] = *cp != '\\0' ? *cp++ : '0';\n+\t\tfstr[i] = '\\0';\n \t\t*fsec = strtol(fstr, NULL, 10);\n #else\n \t\t*fsec = strtod(cp, NULL);\n@@ -1531,15 +1538,22 @@ DecodeTime(char *str, int *tmask, struct tm * tm, fsec_t *fsec)\n \t\telse if (*cp == '.')\n \t\t{\n #ifdef HAVE_INT64_TIMESTAMP\n-\t\t\tchar\t\tfstr[MAXDATELEN + 1];\n+\t\t\tchar\t\tfstr[7];\n+\t\t\tint\t\t\ti;\n+\n+\t\t\tcp++;\n \n \t\t\t/*\n-\t\t\t * OK, we have at most six digits to work with. Let's construct a\n-\t\t\t * string and then do the conversion to an integer.\n+\t\t\t * OK, we have at most six digits to care about. 
Let's construct a\n+\t\t\t * string with those digits, zero-padded on the right, and then do\n+\t\t\t * the conversion to an integer.\n+\t\t\t *\n+\t\t\t * XXX This truncates the seventh digit, unlike rounding it as do\n+\t\t\t * the backend and the !HAVE_INT64_TIMESTAMP case.\n \t\t\t */\n-\t\t\tstrncpy(fstr, (cp + 1), 7);\n-\t\t\tstrcpy(fstr + strlen(fstr), \"000000\");\n-\t\t\t*(fstr + 6) = '\\0';\n+\t\t\tfor (i = 0; i < 6; i++)\n+\t\t\t\tfstr[i] = *cp != '\\0' ? *cp++ : '0';\n+\t\t\tfstr[i] = '\\0';\n \t\t\t*fsec = strtol(fstr, &cp, 10);\n #else\n \t\t\tstr = cp;\n@@ -1665,6 +1679,9 @@ DecodePosixTimezone(char *str, int *tzp)\n *\tDTK_NUMBER can hold date fields (yy.ddd)\n *\tDTK_STRING can hold months (January) and time zones (PST)\n *\tDTK_DATE can hold Posix time zones (GMT-8)\n+ *\n+ * The \"lowstr\" work buffer must have at least strlen(timestr) + MAXDATEFIELDS\n+ * bytes of space. On output, field[] entries will point into it.\n */\n int\n ParseDateTime(char *timestr, char *lowstr,\n@@ -1677,7 +1694,10 @@ ParseDateTime(char *timestr, char *lowstr,\n \t/* outer loop through fields */\n \twhile (*(*endstr) != '\\0')\n \t{\n+\t\t/* Record start of current field */\n \t\tfield[nf] = lp;\n+\t\tif (nf >= MAXDATEFIELDS)\n+\t\t\treturn -1;\n \n \t\t/* leading digit? 
then date or time */\n \t\tif (isdigit((unsigned char) *(*endstr)))\n@@ -1818,8 +1838,6 @@ ParseDateTime(char *timestr, char *lowstr,\n \t\t/* force in a delimiter after each field */\n \t\t*lp++ = '\\0';\n \t\tnf++;\n-\t\tif (nf > MAXDATEFIELDS)\n-\t\t\treturn -1;\n \t}\n \n \t*numfields = nf;\ndiff --git a/src/interfaces/ecpg/pgtypeslib/interval.c b/src/interfaces/ecpg/pgtypeslib/interval.c\nindex 6d0926882e..d0dee16690 100644\n--- a/src/interfaces/ecpg/pgtypeslib/interval.c\n+++ b/src/interfaces/ecpg/pgtypeslib/interval.c\n@@ -1094,7 +1094,7 @@ PGTYPESinterval_from_asc(char *str, char **endptr)\n \ttm->tm_sec = 0;\n \tfsec = 0;\n \n-\tif (strlen(str) >= sizeof(lowstr))\n+\tif (strlen(str) > MAXDATELEN)\n \t{\n \t\terrno = PGTYPES_INTVL_BAD_INTERVAL;\n \t\treturn NULL;\ndiff --git a/src/interfaces/ecpg/pgtypeslib/timestamp.c b/src/interfaces/ecpg/pgtypeslib/timestamp.c\nindex a560af3c38..b0f9bf1521 100644\n--- a/src/interfaces/ecpg/pgtypeslib/timestamp.c\n+++ b/src/interfaces/ecpg/pgtypeslib/timestamp.c\n@@ -294,7 +294,7 @@ PGTYPEStimestamp_from_asc(char *str, char **endptr)\n \tchar\t *realptr;\n \tchar\t **ptr = (endptr != NULL) ? 
endptr : &realptr;\n \n-\tif (strlen(str) >= sizeof(lowstr))\n+\tif (strlen(str) > MAXDATELEN)\n \t{\n \t\terrno = PGTYPES_TS_BAD_TIMESTAMP;\n \t\treturn (noresult);\ndiff --git a/src/interfaces/ecpg/test/expected/pgtypeslib-dt_test2.c b/src/interfaces/ecpg/test/expected/pgtypeslib-dt_test2.c\nindex d3ebb0e106..0ba1936f1d 100644\n--- a/src/interfaces/ecpg/test/expected/pgtypeslib-dt_test2.c\n+++ b/src/interfaces/ecpg/test/expected/pgtypeslib-dt_test2.c\n@@ -45,6 +45,15 @@ char *dates[] = { \"19990108foobar\",\n \t\t\t\t \"1999.008\",\n \t\t\t\t \"J2451187\",\n \t\t\t\t \"January 8, 99 BC\",\n+\t\t\t\t /*\n+\t\t\t\t * Maximize space usage in ParseDateTime() with 25\n+\t\t\t\t * (MAXDATEFIELDS) fields and 128 (MAXDATELEN) total length.\n+\t\t\t\t */\n+\t\t\t\t \"........................Xaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa\"\n+\t\t\t\t \"aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa\",\n+\t\t\t\t /* 26 fields */\n+\t\t\t\t \".........................aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa\"\n+\t\t\t\t \"aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa\",\n \t\t\t\t NULL };\n \n /* do not conflict with libc \"times\" symbol */\n@@ -52,6 +61,7 @@ static char *times[] = { \"0:04\",\n \t\t\t\t \"1:59 PDT\",\n \t\t\t\t \"13:24:40 -8:00\",\n \t\t\t\t \"13:24:40.495+3\",\n+\t\t\t\t \"13:24:40.123456789+3\",\n \t\t\t\t NULL };\n \n char *intervals[] = { \"1 minute\",\n@@ -73,22 +83,22 @@ main(void)\n \t\t \n \t\t \n \t\n-#line 52 \"dt_test2.pgc\"\n+#line 62 \"dt_test2.pgc\"\n date date1 ;\n \n-#line 53 \"dt_test2.pgc\"\n+#line 63 \"dt_test2.pgc\"\n timestamp ts1 , ts2 ;\n \n-#line 54 \"dt_test2.pgc\"\n+#line 64 \"dt_test2.pgc\"\n char * text ;\n \n-#line 55 \"dt_test2.pgc\"\n+#line 65 \"dt_test2.pgc\"\n interval * i1 ;\n \n-#line 56 \"dt_test2.pgc\"\n+#line 66 \"dt_test2.pgc\"\n date * dc ;\n /* exec sql end declare section */\n-#line 57 \"dt_test2.pgc\"\n+#line 67 \"dt_test2.pgc\"\n \n \n \tint i, j;\ndiff --git 
a/src/interfaces/ecpg/test/expected/pgtypeslib-dt_test2.stdout b/src/interfaces/ecpg/test/expected/pgtypeslib-dt_test2.stdout\nindex 24e9d26dfe..9a4587b498 100644\n--- a/src/interfaces/ecpg/test/expected/pgtypeslib-dt_test2.stdout\n+++ b/src/interfaces/ecpg/test/expected/pgtypeslib-dt_test2.stdout\n@@ -8,85 +8,104 @@ TS[3,0]: 1999-01-08 00:04:00\n TS[3,1]: 1999-01-08 01:59:00\n TS[3,2]: 1999-01-08 13:24:40\n TS[3,3]: 1999-01-08 13:24:40.495\n+TS[3,4]: 1999-01-08 13:24:40.123456\n Date[4]: 1999-01-08 (N - F)\n TS[4,0]: 1999-01-08 00:04:00\n TS[4,1]: 1999-01-08 01:59:00\n TS[4,2]: 1999-01-08 13:24:40\n TS[4,3]: 1999-01-08 13:24:40.495\n+TS[4,4]: 1999-01-08 13:24:40.123456\n Date[5]: 1999-01-08 (N - F)\n TS[5,0]: 1999-01-08 00:04:00\n TS[5,1]: 1999-01-08 01:59:00\n TS[5,2]: 1999-01-08 13:24:40\n TS[5,3]: 1999-01-08 13:24:40.495\n+TS[5,4]: 1999-01-08 13:24:40.123456\n Date[6]: 1999-01-18 (N - F)\n TS[6,0]: 1999-01-18 00:04:00\n TS[6,1]: 1999-01-18 01:59:00\n TS[6,2]: 1999-01-18 13:24:40\n TS[6,3]: 1999-01-18 13:24:40.495\n+TS[6,4]: 1999-01-18 13:24:40.123456\n Date[7]: 2003-01-02 (N - F)\n TS[7,0]: 2003-01-02 00:04:00\n TS[7,1]: 2003-01-02 01:59:00\n TS[7,2]: 2003-01-02 13:24:40\n TS[7,3]: 2003-01-02 13:24:40.495\n+TS[7,4]: 2003-01-02 13:24:40.123456\n Date[8]: 1999-01-08 (N - F)\n TS[8,0]: 1999-01-08 00:04:00\n TS[8,1]: 1999-01-08 01:59:00\n TS[8,2]: 1999-01-08 13:24:40\n TS[8,3]: 1999-01-08 13:24:40.495\n+TS[8,4]: 1999-01-08 13:24:40.123456\n Date[9]: 1999-01-08 (N - F)\n TS[9,0]: 1999-01-08 00:04:00\n TS[9,1]: 1999-01-08 01:59:00\n TS[9,2]: 1999-01-08 13:24:40\n TS[9,3]: 1999-01-08 13:24:40.495\n+TS[9,4]: 1999-01-08 13:24:40.123456\n Date[10]: 1999-01-08 (N - F)\n TS[10,0]: 1999-01-08 00:04:00\n TS[10,1]: 1999-01-08 01:59:00\n TS[10,2]: 1999-01-08 13:24:40\n TS[10,3]: 1999-01-08 13:24:40.495\n+TS[10,4]: 1999-01-08 13:24:40.123456\n Date[11]: 1999-01-08 (N - F)\n TS[11,0]: 1999-01-08 00:04:00\n TS[11,1]: 1999-01-08 01:59:00\n TS[11,2]: 1999-01-08 13:24:40\n TS[11,3]: 
1999-01-08 13:24:40.495\n+TS[11,4]: 1999-01-08 13:24:40.123456\n Date[12]: 1999-01-08 (N - F)\n TS[12,0]: 1999-01-08 00:04:00\n TS[12,1]: 1999-01-08 01:59:00\n TS[12,2]: 1999-01-08 13:24:40\n TS[12,3]: 1999-01-08 13:24:40.495\n+TS[12,4]: 1999-01-08 13:24:40.123456\n Date[13]: 2006-01-08 (N - F)\n TS[13,0]: 2006-01-08 00:04:00\n TS[13,1]: 2006-01-08 01:59:00\n TS[13,2]: 2006-01-08 13:24:40\n TS[13,3]: 2006-01-08 13:24:40.495\n+TS[13,4]: 2006-01-08 13:24:40.123456\n Date[14]: 1999-01-08 (N - F)\n TS[14,0]: 1999-01-08 00:04:00\n TS[14,1]: 1999-01-08 01:59:00\n TS[14,2]: 1999-01-08 13:24:40\n TS[14,3]: 1999-01-08 13:24:40.495\n+TS[14,4]: 1999-01-08 13:24:40.123456\n Date[15]: 1999-01-08 (N - F)\n TS[15,0]: 1999-01-08 00:04:00\n TS[15,1]: 1999-01-08 01:59:00\n TS[15,2]: 1999-01-08 13:24:40\n TS[15,3]: 1999-01-08 13:24:40.495\n+TS[15,4]: 1999-01-08 13:24:40.123456\n Date[16]: 1999-01-08 (N - F)\n TS[16,0]: 1999-01-08 00:04:00\n TS[16,1]: 1999-01-08 01:59:00\n TS[16,2]: 1999-01-08 13:24:40\n TS[16,3]: 1999-01-08 13:24:40.495\n+TS[16,4]: 1999-01-08 13:24:40.123456\n Date[17]: 1999-01-08 (N - F)\n TS[17,0]: 1999-01-08 00:04:00\n TS[17,1]: 1999-01-08 01:59:00\n TS[17,2]: 1999-01-08 13:24:40\n TS[17,3]: 1999-01-08 13:24:40.495\n+TS[17,4]: 1999-01-08 13:24:40.123456\n Date[18]: 1999-01-08 (N - F)\n TS[18,0]: 1999-01-08 00:04:00\n TS[18,1]: 1999-01-08 01:59:00\n TS[18,2]: 1999-01-08 13:24:40\n TS[18,3]: 1999-01-08 13:24:40.495\n+TS[18,4]: 1999-01-08 13:24:40.123456\n Date[19]: 0099-01-08 BC (N - F)\n TS[19,0]: 0099-01-08 00:04:00 BC\n TS[19,1]: 0099-01-08 01:59:00 BC\n TS[19,2]: 0099-01-08 13:24:40 BC\n+TS[19,4]: 0099-01-08 13:24:40.123456 BC\n+Date[20]: - (N - T)\n+Date[21]: - (N - T)\n interval[0]: @ 1 min\n interval_copy[0]: @ 1 min\n interval[1]: @ 1 day 12 hours 59 mins 10 secs\ndiff --git a/src/interfaces/ecpg/test/pgtypeslib/dt_test2.pgc b/src/interfaces/ecpg/test/pgtypeslib/dt_test2.pgc\nindex 0edf012fd1..a127dd93a6 100644\n--- 
a/src/interfaces/ecpg/test/pgtypeslib/dt_test2.pgc\n+++ b/src/interfaces/ecpg/test/pgtypeslib/dt_test2.pgc\n@@ -27,6 +27,15 @@ char *dates[] = { \"19990108foobar\",\n \t\t\t\t \"1999.008\",\n \t\t\t\t \"J2451187\",\n \t\t\t\t \"January 8, 99 BC\",\n+\t\t\t\t /*\n+\t\t\t\t * Maximize space usage in ParseDateTime() with 25\n+\t\t\t\t * (MAXDATEFIELDS) fields and 128 (MAXDATELEN) total length.\n+\t\t\t\t */\n+\t\t\t\t \"........................Xaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa\"\n+\t\t\t\t \"aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa\",\n+\t\t\t\t /* 26 fields */\n+\t\t\t\t \".........................aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa\"\n+\t\t\t\t \"aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa\",\n \t\t\t\t NULL };\n \n /* do not conflict with libc \"times\" symbol */\n@@ -34,6 +43,7 @@ static char *times[] = { \"0:04\",\n \t\t\t\t \"1:59 PDT\",\n \t\t\t\t \"13:24:40 -8:00\",\n \t\t\t\t \"13:24:40.495+3\",\n+\t\t\t\t \"13:24:40.123456789+3\",\n \t\t\t\t NULL };\n \n char *intervals[] = { \"1 minute\",\ndiff --git a/src/test/regress/expected/interval.out b/src/test/regress/expected/interval.out\nindex 3bf221187b..99fd0ca490 100644\n--- a/src/test/regress/expected/interval.out\n+++ b/src/test/regress/expected/interval.out\n@@ -306,6 +306,13 @@ select '4 millenniums 5 centuries 4 decades 1 year 4 months 4 days 17 minutes 31\n @ 4541 years 4 mons 4 days 17 mins 31 secs\n (1 row)\n \n+-- test long interval output\n+select '100000000y 10mon -1000000000d -1000000000h -10min -10.000001s ago'::interval;\n+ interval \n+-------------------------------------------------------------------------------------------\n+ @ 100000000 years 10 mons -1000000000 days -1000000000 hours -10 mins -10.000001 secs ago\n+(1 row)\n+\n -- test justify_hours() and justify_days()\n SELECT justify_hours(interval '6 months 3 days 52 hours 3 minutes 2 seconds') as \"6 mons 5 days 4 hours 3 mins 2 seconds\";\n 6 mons 5 days 4 hours 3 mins 2 seconds 
\ndiff --git a/src/test/regress/sql/interval.sql b/src/test/regress/sql/interval.sql\nindex f1da4c2911..7cee2864de 100644\n--- a/src/test/regress/sql/interval.sql\n+++ b/src/test/regress/sql/interval.sql\n@@ -108,6 +108,8 @@ select avg(f1) from interval_tbl;\n -- test long interval input\n select '4 millenniums 5 centuries 4 decades 1 year 4 months 4 days 17 minutes 31 seconds'::interval;\n \n+-- test long interval output\n+select '100000000y 10mon -1000000000d -1000000000h -10min -10.000001s ago'::interval;\n \n -- test justify_hours() and justify_days()\n \n-- \n2.30.2\n",
|
8 |
+
"Many server functions use the MAXDATELEN constant to size a buffer for\nparsing or displaying a datetime value. It was much too small for the\nlongest possible interval output and slightly too small for certain\nvalid timestamp input, particularly input with a long timezone name.\nThe long input was rejected needlessly; the long output caused\ninterval_out() to overrun its buffer. ECPG's pgtypes library has a copy\nof the vulnerable functions, which bore the same vulnerabilities along\nwith some of its own. In contrast to the server, certain long inputs\ncaused stack overflow rather than failing cleanly. Back-patch to 8.4\n(all supported versions).\n\nReported by Daniel Schüssler, reviewed by Tom Lane.\n\nSecurity: CVE-2014-0063"
|
9 |
+
],
|
10 |
+
[
|
11 |
+
"diff --git a/src/lxc/lxclock.c b/src/lxc/lxclock.c\nindex fe13898df9..e9e95f7a01 100644\n--- a/src/lxc/lxclock.c\n+++ b/src/lxc/lxclock.c\n@@ -103,13 +103,13 @@ static char *lxclock_name(const char *p, const char *n)\n \tchar *rundir;\n \n \t/* lockfile will be:\n-\t * \"/run\" + \"/lock/lxc/$lxcpath/$lxcname + '\\0' if root\n+\t * \"/run\" + \"/lxc/lock/$lxcpath/$lxcname + '\\0' if root\n \t * or\n-\t * $XDG_RUNTIME_DIR + \"/lock/lxc/$lxcpath/$lxcname + '\\0' if non-root\n+\t * $XDG_RUNTIME_DIR + \"/lxc/lock/$lxcpath/$lxcname + '\\0' if non-root\n \t */\n \n-\t/* length of \"/lock/lxc/\" + $lxcpath + \"/\" + \".\" + $lxcname + '\\0' */\n-\tlen = strlen(\"/lock/lxc/\") + strlen(n) + strlen(p) + 3;\n+\t/* length of \"/lxc/lock/\" + $lxcpath + \"/\" + \".\" + $lxcname + '\\0' */\n+\tlen = strlen(\"/lxc/lock/\") + strlen(n) + strlen(p) + 3;\n \trundir = get_rundir();\n \tif (!rundir)\n \t\treturn NULL;\n@@ -120,7 +120,7 @@ static char *lxclock_name(const char *p, const char *n)\n \t\treturn NULL;\n \t}\n \n-\tret = snprintf(dest, len, \"%s/lock/lxc/%s\", rundir, p);\n+\tret = snprintf(dest, len, \"%s/lxc/lock/%s\", rundir, p);\n \tif (ret < 0 || ret >= len) {\n \t\tfree(dest);\n \t\tfree(rundir);\n@@ -128,40 +128,13 @@ static char *lxclock_name(const char *p, const char *n)\n \t}\n \tret = mkdir_p(dest, 0755);\n \tif (ret < 0) {\n-\t\t/* fall back to \"/tmp/\" + $(id -u) + \"/lxc\" + $lxcpath + \"/\" + \".\" + $lxcname + '\\0'\n-\t\t * * maximum length of $(id -u) is 10 calculated by (log (2 ** (sizeof(uid_t) * 8) - 1) / log 10 + 1)\n-\t\t * * lxcpath always starts with '/'\n-\t\t */\n-\t\tint l2 = 22 + strlen(n) + strlen(p);\n-\t\tif (l2 > len) {\n-\t\t\tchar *d;\n-\t\t\td = realloc(dest, l2);\n-\t\t\tif (!d) {\n-\t\t\t\tfree(dest);\n-\t\t\t\tfree(rundir);\n-\t\t\t\treturn NULL;\n-\t\t\t}\n-\t\t\tlen = l2;\n-\t\t\tdest = d;\n-\t\t}\n-\t\tret = snprintf(dest, len, \"/tmp/%d/lxc%s\", geteuid(), p);\n-\t\tif (ret < 0 || ret >= len) 
{\n-\t\t\tfree(dest);\n-\t\t\tfree(rundir);\n-\t\t\treturn NULL;\n-\t\t}\n-\t\tret = mkdir_p(dest, 0755);\n-\t\tif (ret < 0) {\n-\t\t\tfree(dest);\n-\t\t\tfree(rundir);\n-\t\t\treturn NULL;\n-\t\t}\n-\t\tret = snprintf(dest, len, \"/tmp/%d/lxc%s/.%s\", geteuid(), p, n);\n-\t} else\n-\t\tret = snprintf(dest, len, \"%s/lock/lxc/%s/.%s\", rundir, p, n);\n+\t\tfree(dest);\n+\t\tfree(rundir);\n+\t\treturn NULL;\n+\t}\n \n+\tret = snprintf(dest, len, \"%s/lxc/lock/%s/.%s\", rundir, p, n);\n \tfree(rundir);\n-\n \tif (ret < 0 || ret >= len) {\n \t\tfree(dest);\n \t\treturn NULL;\ndiff --git a/src/tests/locktests.c b/src/tests/locktests.c\nindex dd3393a893..233ca127c6 100644\n--- a/src/tests/locktests.c\n+++ b/src/tests/locktests.c\n@@ -122,7 +122,7 @@ int main(int argc, char *argv[])\n \t\texit(1);\n \t}\n \tstruct stat sb;\n-\tchar *pathname = RUNTIME_PATH \"/lock/lxc/var/lib/lxc/\";\n+\tchar *pathname = RUNTIME_PATH \"/lxc/lock/var/lib/lxc/\";\n \tret = stat(pathname, &sb);\n \tif (ret != 0) {\n \t\tfprintf(stderr, \"%d: filename %s not created\\n\", __LINE__,\n",
|
12 |
+
"lxclock: use /run/lxc/lock rather than /run/lock/lxc\nThis prevents an unprivileged user to use LXC to create arbitrary file\non the filesystem.\n\nSigned-off-by: Serge Hallyn <[email protected]>\nSigned-off-by: Tyler Hicks <[email protected]>\nAcked-by: Stephane Graber <[email protected]>"
|
13 |
+
]
|
14 |
+
]
|
backend/inference.py
ADDED
@@ -0,0 +1,32 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from backend.section_infer_helper.local_llm_helper import local_llm_helper
|
2 |
+
from backend.section_infer_helper.online_llm_helper import online_llm_helper
|
3 |
+
from backend.cwe_infer_helper import cwe_infer_helper
|
4 |
+
|
5 |
+
MODEL_TYPE_HELPER_MAP = {
|
6 |
+
"Local LLM": local_llm_helper,
|
7 |
+
"Online LLM": online_llm_helper
|
8 |
+
}
|
9 |
+
|
10 |
+
PREDEF_MODEL_MAP = {
|
11 |
+
"Local LLM": local_llm_helper.PREDEF_MODEL,
|
12 |
+
"Online LLM": online_llm_helper.PREDEF_MODEL
|
13 |
+
}
|
14 |
+
|
15 |
+
LOCAL_MODEL_PEFT_MAP = local_llm_helper.MODEL_PEFT_MAP
|
16 |
+
|
17 |
+
PREDEF_CWE_MODEL = cwe_infer_helper.PREDEF_MODEL
|
18 |
+
|
19 |
+
def section_infer(diff_code, patch_message, model_type, *model_config):
|
20 |
+
helper = MODEL_TYPE_HELPER_MAP.get(model_type, None)
|
21 |
+
if helper is None:
|
22 |
+
raise ValueError(f"Model {model_type} is not supported")
|
23 |
+
helper.load_model(*model_config)
|
24 |
+
results = helper.infer(diff_code, patch_message)
|
25 |
+
return results
|
26 |
+
|
27 |
+
|
28 |
+
def cwe_infer(diff_code, patch_message, *model_config):
|
29 |
+
helper = cwe_infer_helper
|
30 |
+
helper.load_model(*model_config)
|
31 |
+
results = helper.infer(diff_code, patch_message)
|
32 |
+
return results
|
backend/model/PEFT/patchouli-qwc2.5-0.5b/adapter_config.json
ADDED
@@ -0,0 +1,34 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"alpha_pattern": {},
|
3 |
+
"auto_mapping": null,
|
4 |
+
"base_model_name_or_path": "Qwen/Qwen2.5-Coder-0.5B-Instruct",
|
5 |
+
"bias": "none",
|
6 |
+
"fan_in_fan_out": false,
|
7 |
+
"inference_mode": true,
|
8 |
+
"init_lora_weights": true,
|
9 |
+
"layer_replication": null,
|
10 |
+
"layers_pattern": null,
|
11 |
+
"layers_to_transform": null,
|
12 |
+
"loftq_config": {},
|
13 |
+
"lora_alpha": 16,
|
14 |
+
"lora_dropout": 0.0,
|
15 |
+
"megatron_config": null,
|
16 |
+
"megatron_core": "megatron.core",
|
17 |
+
"modules_to_save": null,
|
18 |
+
"peft_type": "LORA",
|
19 |
+
"r": 8,
|
20 |
+
"rank_pattern": {},
|
21 |
+
"revision": null,
|
22 |
+
"target_modules": [
|
23 |
+
"down_proj",
|
24 |
+
"gate_proj",
|
25 |
+
"o_proj",
|
26 |
+
"v_proj",
|
27 |
+
"up_proj",
|
28 |
+
"q_proj",
|
29 |
+
"k_proj"
|
30 |
+
],
|
31 |
+
"task_type": "CAUSAL_LM",
|
32 |
+
"use_dora": false,
|
33 |
+
"use_rslora": false
|
34 |
+
}
|
backend/model/PEFT/patchouli-qwc2.5-0.5b/adapter_model.safetensors
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:d7b89f36b9e9ab81e75f2b2f0156e771254f09e6a2308b6c90cad9af7d93b02a
|
3 |
+
size 8841928
|
backend/model/PEFT/patchouli-qwc2.5-0.5b/added_tokens.json
ADDED
@@ -0,0 +1,24 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"</tool_call>": 151658,
|
3 |
+
"<tool_call>": 151657,
|
4 |
+
"<|box_end|>": 151649,
|
5 |
+
"<|box_start|>": 151648,
|
6 |
+
"<|endoftext|>": 151643,
|
7 |
+
"<|file_sep|>": 151664,
|
8 |
+
"<|fim_middle|>": 151660,
|
9 |
+
"<|fim_pad|>": 151662,
|
10 |
+
"<|fim_prefix|>": 151659,
|
11 |
+
"<|fim_suffix|>": 151661,
|
12 |
+
"<|im_end|>": 151645,
|
13 |
+
"<|im_start|>": 151644,
|
14 |
+
"<|image_pad|>": 151655,
|
15 |
+
"<|object_ref_end|>": 151647,
|
16 |
+
"<|object_ref_start|>": 151646,
|
17 |
+
"<|quad_end|>": 151651,
|
18 |
+
"<|quad_start|>": 151650,
|
19 |
+
"<|repo_name|>": 151663,
|
20 |
+
"<|video_pad|>": 151656,
|
21 |
+
"<|vision_end|>": 151653,
|
22 |
+
"<|vision_pad|>": 151654,
|
23 |
+
"<|vision_start|>": 151652
|
24 |
+
}
|
backend/model/PEFT/patchouli-qwc2.5-0.5b/merges.txt
ADDED
The diff for this file is too large to render.
See raw diff
|
|
backend/model/PEFT/patchouli-qwc2.5-0.5b/special_tokens_map.json
ADDED
@@ -0,0 +1,31 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"additional_special_tokens": [
|
3 |
+
"<|im_start|>",
|
4 |
+
"<|im_end|>",
|
5 |
+
"<|object_ref_start|>",
|
6 |
+
"<|object_ref_end|>",
|
7 |
+
"<|box_start|>",
|
8 |
+
"<|box_end|>",
|
9 |
+
"<|quad_start|>",
|
10 |
+
"<|quad_end|>",
|
11 |
+
"<|vision_start|>",
|
12 |
+
"<|vision_end|>",
|
13 |
+
"<|vision_pad|>",
|
14 |
+
"<|image_pad|>",
|
15 |
+
"<|video_pad|>"
|
16 |
+
],
|
17 |
+
"eos_token": {
|
18 |
+
"content": "<|im_end|>",
|
19 |
+
"lstrip": false,
|
20 |
+
"normalized": false,
|
21 |
+
"rstrip": false,
|
22 |
+
"single_word": false
|
23 |
+
},
|
24 |
+
"pad_token": {
|
25 |
+
"content": "<|endoftext|>",
|
26 |
+
"lstrip": false,
|
27 |
+
"normalized": false,
|
28 |
+
"rstrip": false,
|
29 |
+
"single_word": false
|
30 |
+
}
|
31 |
+
}
|
backend/model/PEFT/patchouli-qwc2.5-0.5b/tokenizer.json
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:9c5ae00e602b8860cbd784ba82a8aa14e8feecec692e7076590d014d7b7fdafa
|
3 |
+
size 11421896
|
backend/model/PEFT/patchouli-qwc2.5-0.5b/tokenizer_config.json
ADDED
@@ -0,0 +1,208 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"add_bos_token": false,
|
3 |
+
"add_prefix_space": false,
|
4 |
+
"added_tokens_decoder": {
|
5 |
+
"151643": {
|
6 |
+
"content": "<|endoftext|>",
|
7 |
+
"lstrip": false,
|
8 |
+
"normalized": false,
|
9 |
+
"rstrip": false,
|
10 |
+
"single_word": false,
|
11 |
+
"special": true
|
12 |
+
},
|
13 |
+
"151644": {
|
14 |
+
"content": "<|im_start|>",
|
15 |
+
"lstrip": false,
|
16 |
+
"normalized": false,
|
17 |
+
"rstrip": false,
|
18 |
+
"single_word": false,
|
19 |
+
"special": true
|
20 |
+
},
|
21 |
+
"151645": {
|
22 |
+
"content": "<|im_end|>",
|
23 |
+
"lstrip": false,
|
24 |
+
"normalized": false,
|
25 |
+
"rstrip": false,
|
26 |
+
"single_word": false,
|
27 |
+
"special": true
|
28 |
+
},
|
29 |
+
"151646": {
|
30 |
+
"content": "<|object_ref_start|>",
|
31 |
+
"lstrip": false,
|
32 |
+
"normalized": false,
|
33 |
+
"rstrip": false,
|
34 |
+
"single_word": false,
|
35 |
+
"special": true
|
36 |
+
},
|
37 |
+
"151647": {
|
38 |
+
"content": "<|object_ref_end|>",
|
39 |
+
"lstrip": false,
|
40 |
+
"normalized": false,
|
41 |
+
"rstrip": false,
|
42 |
+
"single_word": false,
|
43 |
+
"special": true
|
44 |
+
},
|
45 |
+
"151648": {
|
46 |
+
"content": "<|box_start|>",
|
47 |
+
"lstrip": false,
|
48 |
+
"normalized": false,
|
49 |
+
"rstrip": false,
|
50 |
+
"single_word": false,
|
51 |
+
"special": true
|
52 |
+
},
|
53 |
+
"151649": {
|
54 |
+
"content": "<|box_end|>",
|
55 |
+
"lstrip": false,
|
56 |
+
"normalized": false,
|
57 |
+
"rstrip": false,
|
58 |
+
"single_word": false,
|
59 |
+
"special": true
|
60 |
+
},
|
61 |
+
"151650": {
|
62 |
+
"content": "<|quad_start|>",
|
63 |
+
"lstrip": false,
|
64 |
+
"normalized": false,
|
65 |
+
"rstrip": false,
|
66 |
+
"single_word": false,
|
67 |
+
"special": true
|
68 |
+
},
|
69 |
+
"151651": {
|
70 |
+
"content": "<|quad_end|>",
|
71 |
+
"lstrip": false,
|
72 |
+
"normalized": false,
|
73 |
+
"rstrip": false,
|
74 |
+
"single_word": false,
|
75 |
+
"special": true
|
76 |
+
},
|
77 |
+
"151652": {
|
78 |
+
"content": "<|vision_start|>",
|
79 |
+
"lstrip": false,
|
80 |
+
"normalized": false,
|
81 |
+
"rstrip": false,
|
82 |
+
"single_word": false,
|
83 |
+
"special": true
|
84 |
+
},
|
85 |
+
"151653": {
|
86 |
+
"content": "<|vision_end|>",
|
87 |
+
"lstrip": false,
|
88 |
+
"normalized": false,
|
89 |
+
"rstrip": false,
|
90 |
+
"single_word": false,
|
91 |
+
"special": true
|
92 |
+
},
|
93 |
+
"151654": {
|
94 |
+
"content": "<|vision_pad|>",
|
95 |
+
"lstrip": false,
|
96 |
+
"normalized": false,
|
97 |
+
"rstrip": false,
|
98 |
+
"single_word": false,
|
99 |
+
"special": true
|
100 |
+
},
|
101 |
+
"151655": {
|
102 |
+
"content": "<|image_pad|>",
|
103 |
+
"lstrip": false,
|
104 |
+
"normalized": false,
|
105 |
+
"rstrip": false,
|
106 |
+
"single_word": false,
|
107 |
+
"special": true
|
108 |
+
},
|
109 |
+
"151656": {
|
110 |
+
"content": "<|video_pad|>",
|
111 |
+
"lstrip": false,
|
112 |
+
"normalized": false,
|
113 |
+
"rstrip": false,
|
114 |
+
"single_word": false,
|
115 |
+
"special": true
|
116 |
+
},
|
117 |
+
"151657": {
|
118 |
+
"content": "<tool_call>",
|
119 |
+
"lstrip": false,
|
120 |
+
"normalized": false,
|
121 |
+
"rstrip": false,
|
122 |
+
"single_word": false,
|
123 |
+
"special": false
|
124 |
+
},
|
125 |
+
"151658": {
|
126 |
+
"content": "</tool_call>",
|
127 |
+
"lstrip": false,
|
128 |
+
"normalized": false,
|
129 |
+
"rstrip": false,
|
130 |
+
"single_word": false,
|
131 |
+
"special": false
|
132 |
+
},
|
133 |
+
"151659": {
|
134 |
+
"content": "<|fim_prefix|>",
|
135 |
+
"lstrip": false,
|
136 |
+
"normalized": false,
|
137 |
+
"rstrip": false,
|
138 |
+
"single_word": false,
|
139 |
+
"special": false
|
140 |
+
},
|
141 |
+
"151660": {
|
142 |
+
"content": "<|fim_middle|>",
|
143 |
+
"lstrip": false,
|
144 |
+
"normalized": false,
|
145 |
+
"rstrip": false,
|
146 |
+
"single_word": false,
|
147 |
+
"special": false
|
148 |
+
},
|
149 |
+
"151661": {
|
150 |
+
"content": "<|fim_suffix|>",
|
151 |
+
"lstrip": false,
|
152 |
+
"normalized": false,
|
153 |
+
"rstrip": false,
|
154 |
+
"single_word": false,
|
155 |
+
"special": false
|
156 |
+
},
|
157 |
+
"151662": {
|
158 |
+
"content": "<|fim_pad|>",
|
159 |
+
"lstrip": false,
|
160 |
+
"normalized": false,
|
161 |
+
"rstrip": false,
|
162 |
+
"single_word": false,
|
163 |
+
"special": false
|
164 |
+
},
|
165 |
+
"151663": {
|
166 |
+
"content": "<|repo_name|>",
|
167 |
+
"lstrip": false,
|
168 |
+
"normalized": false,
|
169 |
+
"rstrip": false,
|
170 |
+
"single_word": false,
|
171 |
+
"special": false
|
172 |
+
},
|
173 |
+
"151664": {
|
174 |
+
"content": "<|file_sep|>",
|
175 |
+
"lstrip": false,
|
176 |
+
"normalized": false,
|
177 |
+
"rstrip": false,
|
178 |
+
"single_word": false,
|
179 |
+
"special": false
|
180 |
+
}
|
181 |
+
},
|
182 |
+
"additional_special_tokens": [
|
183 |
+
"<|im_start|>",
|
184 |
+
"<|im_end|>",
|
185 |
+
"<|object_ref_start|>",
|
186 |
+
"<|object_ref_end|>",
|
187 |
+
"<|box_start|>",
|
188 |
+
"<|box_end|>",
|
189 |
+
"<|quad_start|>",
|
190 |
+
"<|quad_end|>",
|
191 |
+
"<|vision_start|>",
|
192 |
+
"<|vision_end|>",
|
193 |
+
"<|vision_pad|>",
|
194 |
+
"<|image_pad|>",
|
195 |
+
"<|video_pad|>"
|
196 |
+
],
|
197 |
+
"bos_token": null,
|
198 |
+
"chat_template": "{%- if tools %}\n {{- '<|im_start|>system\\n' }}\n {%- if messages[0]['role'] == 'system' %}\n {{- messages[0]['content'] }}\n {%- else %}\n {{- 'You are Qwen, created by Alibaba Cloud. You are a helpful assistant.' }}\n {%- endif %}\n {{- \"\\n\\n# Tools\\n\\nYou may call one or more functions to assist with the user query.\\n\\nYou are provided with function signatures within <tools></tools> XML tags:\\n<tools>\" }}\n {%- for tool in tools %}\n {{- \"\\n\" }}\n {{- tool | tojson }}\n {%- endfor %}\n {{- \"\\n</tools>\\n\\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\\n<tool_call>\\n{\\\"name\\\": <function-name>, \\\"arguments\\\": <args-json-object>}\\n</tool_call><|im_end|>\\n\" }}\n{%- else %}\n {%- if messages[0]['role'] == 'system' %}\n {{- '<|im_start|>system\\n' + messages[0]['content'] + '<|im_end|>\\n' }}\n {%- else %}\n {{- '<|im_start|>system\\nYou are Qwen, created by Alibaba Cloud. 
You are a helpful assistant.<|im_end|>\\n' }}\n {%- endif %}\n{%- endif %}\n{%- for message in messages %}\n {%- if (message.role == \"user\") or (message.role == \"system\" and not loop.first) or (message.role == \"assistant\" and not message.tool_calls) %}\n {{- '<|im_start|>' + message.role + '\\n' + message.content + '<|im_end|>' + '\\n' }}\n {%- elif message.role == \"assistant\" %}\n {{- '<|im_start|>' + message.role }}\n {%- if message.content %}\n {{- '\\n' + message.content }}\n {%- endif %}\n {%- for tool_call in message.tool_calls %}\n {%- if tool_call.function is defined %}\n {%- set tool_call = tool_call.function %}\n {%- endif %}\n {{- '\\n<tool_call>\\n{\"name\": \"' }}\n {{- tool_call.name }}\n {{- '\", \"arguments\": ' }}\n {{- tool_call.arguments | tojson }}\n {{- '}\\n</tool_call>' }}\n {%- endfor %}\n {{- '<|im_end|>\\n' }}\n {%- elif message.role == \"tool\" %}\n {%- if (loop.index0 == 0) or (messages[loop.index0 - 1].role != \"tool\") %}\n {{- '<|im_start|>user' }}\n {%- endif %}\n {{- '\\n<tool_response>\\n' }}\n {{- message.content }}\n {{- '\\n</tool_response>' }}\n {%- if loop.last or (messages[loop.index0 + 1].role != \"tool\") %}\n {{- '<|im_end|>\\n' }}\n {%- endif %}\n {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n {{- '<|im_start|>assistant\\n' }}\n{%- endif %}\n",
|
199 |
+
"clean_up_tokenization_spaces": false,
|
200 |
+
"eos_token": "<|im_end|>",
|
201 |
+
"errors": "replace",
|
202 |
+
"model_max_length": 32768,
|
203 |
+
"pad_token": "<|endoftext|>",
|
204 |
+
"padding_side": "right",
|
205 |
+
"split_special_tokens": false,
|
206 |
+
"tokenizer_class": "Qwen2Tokenizer",
|
207 |
+
"unk_token": null
|
208 |
+
}
|
backend/model/PEFT/patchouli-qwc2.5-0.5b/vocab.json
ADDED
The diff for this file is too large to render.
See raw diff
|
|
backend/model/cwe-cls/patchouli-unixcoder/config.json
ADDED
@@ -0,0 +1,57 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"_name_or_path": "microsoft/unixcoder-base-nine",
|
3 |
+
"architectures": [
|
4 |
+
"RobertaForSequenceClassification"
|
5 |
+
],
|
6 |
+
"attention_probs_dropout_prob": 0.1,
|
7 |
+
"bos_token_id": 0,
|
8 |
+
"classifier_dropout": null,
|
9 |
+
"eos_token_id": 2,
|
10 |
+
"gradient_checkpointing": false,
|
11 |
+
"hidden_act": "gelu",
|
12 |
+
"hidden_dropout_prob": 0.1,
|
13 |
+
"hidden_size": 768,
|
14 |
+
"id2label": {
|
15 |
+
"0": "CWE-79",
|
16 |
+
"1": "CWE-787",
|
17 |
+
"2": "CWE-89",
|
18 |
+
"3": "CWE-352",
|
19 |
+
"4": "CWE-22",
|
20 |
+
"5": "CWE-125",
|
21 |
+
"6": "CWE-78",
|
22 |
+
"7": "CWE-416",
|
23 |
+
"8": "CWE-862",
|
24 |
+
"9": "CWE-434",
|
25 |
+
"10": "CWE-94",
|
26 |
+
"11": "CWE-20",
|
27 |
+
"12": "CWE-77",
|
28 |
+
"13": "CWE-287",
|
29 |
+
"14": "CWE-269",
|
30 |
+
"15": "CWE-502",
|
31 |
+
"16": "CWE-200",
|
32 |
+
"17": "CWE-863",
|
33 |
+
"18": "CWE-918",
|
34 |
+
"19": "CWE-119",
|
35 |
+
"20": "CWE-476",
|
36 |
+
"21": "CWE-798",
|
37 |
+
"22": "CWE-190",
|
38 |
+
"23": "CWE-400",
|
39 |
+
"24": "CWE-306"
|
40 |
+
},
|
41 |
+
"initializer_range": 0.02,
|
42 |
+
"intermediate_size": 3072,
|
43 |
+
"layer_norm_eps": 1e-05,
|
44 |
+
"max_position_embeddings": 1026,
|
45 |
+
"model_type": "roberta",
|
46 |
+
"num_attention_heads": 12,
|
47 |
+
"num_hidden_layers": 12,
|
48 |
+
"output_past": true,
|
49 |
+
"pad_token_id": 1,
|
50 |
+
"position_embedding_type": "absolute",
|
51 |
+
"problem_type": "single_label_classification",
|
52 |
+
"torch_dtype": "float32",
|
53 |
+
"transformers_version": "4.47.1",
|
54 |
+
"type_vocab_size": 10,
|
55 |
+
"use_cache": true,
|
56 |
+
"vocab_size": 51416
|
57 |
+
}
|
backend/model/cwe-cls/patchouli-unixcoder/model.safetensors
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:63620e0ba21c9d79f01b0ca6aa1c32c09119da66e3e5fed7f6481e1e61a0d103
|
3 |
+
size 503819956
|
backend/section_infer_helper/base_helper.py
ADDED
@@ -0,0 +1,51 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from backend.utils.data_process import LANGUAGE_EXT_MAP
|
2 |
+
|
3 |
+
from abc import ABC, abstractmethod
|
4 |
+
|
5 |
+
class BaseHelper(ABC):
|
6 |
+
|
7 |
+
def _get_lang_ext(language_list):
|
8 |
+
ext_list = []
|
9 |
+
for lang in language_list:
|
10 |
+
ext_list.extend(LANGUAGE_EXT_MAP.get(lang, []))
|
11 |
+
return ext_list
|
12 |
+
|
13 |
+
|
14 |
+
def _get_lang_by_ext(ext):
|
15 |
+
for lang, ext_list in LANGUAGE_EXT_MAP.items():
|
16 |
+
if ext in ext_list:
|
17 |
+
return lang
|
18 |
+
return None
|
19 |
+
|
20 |
+
|
21 |
+
class InputData():
|
22 |
+
def __init__(self, filename, patch, section, patch_msg):
|
23 |
+
self.filename = filename
|
24 |
+
self.patch = patch
|
25 |
+
self.section = section
|
26 |
+
self.patch_msg = patch_msg
|
27 |
+
|
28 |
+
|
29 |
+
@abstractmethod
|
30 |
+
def load_model(self, *args, **kwargs):
|
31 |
+
raise NotImplementedError()
|
32 |
+
|
33 |
+
|
34 |
+
@abstractmethod
|
35 |
+
def infer(self, diff_code, message = None, batch_size = 1):
|
36 |
+
'''
|
37 |
+
Result format:
|
38 |
+
[
|
39 |
+
file: [
|
40 |
+
{
|
41 |
+
"section": section,
|
42 |
+
"predict": 1/0,
|
43 |
+
"conf": conf
|
44 |
+
},
|
45 |
+
...
|
46 |
+
],
|
47 |
+
...
|
48 |
+
]
|
49 |
+
'''
|
50 |
+
raise NotImplementedError()
|
51 |
+
|
backend/section_infer_helper/local_llm_helper.py
ADDED
@@ -0,0 +1,197 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import logging
|
2 |
+
import torch
|
3 |
+
from torch.nn.functional import softmax
|
4 |
+
from transformers import AutoTokenizer, AutoModelForCausalLM
|
5 |
+
from peft import PeftModel
|
6 |
+
from tqdm import tqdm
|
7 |
+
from collections import defaultdict
|
8 |
+
|
9 |
+
from backend.section_infer_helper.base_helper import BaseHelper
|
10 |
+
from backend.utils.data_process import split_to_file_diff, split_to_section
|
11 |
+
|
12 |
+
|
13 |
+
logger = logging.getLogger(__name__)
|
14 |
+
|
15 |
+
|
16 |
+
class LocalLLMHelper(BaseHelper):
|
17 |
+
|
18 |
+
MAX_LENGTH = 4096
|
19 |
+
MAX_NEW_TOKEN = 16
|
20 |
+
BATCH_SIZE = 4
|
21 |
+
|
22 |
+
SYSTEM_PROMPT = "You are now an expert in code vulnerability and patch fixes."
|
23 |
+
|
24 |
+
def generate_instruction(language, file_name, patch, section, message = None):
|
25 |
+
instruction = "[TASK]\nHere is a patch in {} language and a section of this patch for a source code file with path {}. Determine if the patch section fixes any software vulnerabilities. Output 'yes' or 'no' and do not output any other text.\n".format(language, file_name)
|
26 |
+
instruction += "[Patch]\n{}\n".format(patch)
|
27 |
+
instruction += "[A section of this patch]\n{}\n".format(section)
|
28 |
+
if message is not None and message != "":
|
29 |
+
instruction += "[Message of the Patch]\n{}\n".format(message)
|
30 |
+
|
31 |
+
return instruction
|
32 |
+
|
33 |
+
MODEL_CONFIGS = defaultdict(lambda: {
|
34 |
+
"supported_languages": ["C", "C++", "Java", "Python"],
|
35 |
+
})
|
36 |
+
|
37 |
+
MODEL_CONFIGS.update({
|
38 |
+
("Qwen/Qwen2.5-Coder-0.5B-Instruct", "backend/model/PEFT/patchouli-qwc2.5-0.5b"): {
|
39 |
+
"supported_languages": ["C", "C++", "Java", "Python"],
|
40 |
+
},
|
41 |
+
("Qwen/Qwen2.5-Coder-0.5B-Instruct", None): {
|
42 |
+
"supported_languages": ["C", "C++", "Java", "Python"],
|
43 |
+
},
|
44 |
+
("Qwen/Qwen2.5-Coder-7B-Instruct", None): {
|
45 |
+
"supported_languages": ["C", "C++", "Java", "Python"],
|
46 |
+
},
|
47 |
+
("deepseek-ai/deepseek-coder-7b-instruct-v1.5", None): {
|
48 |
+
"supported_languages": ["C", "C++", "Java", "Python"],
|
49 |
+
},
|
50 |
+
("codellama/CodeLlama-7b-Instruct-hf", None): {
|
51 |
+
"supported_languages": ["C", "C++", "Java", "Python"],
|
52 |
+
},
|
53 |
+
})
|
54 |
+
|
55 |
+
PREDEF_MODEL = []
|
56 |
+
for model, peft in MODEL_CONFIGS.keys():
|
57 |
+
if model not in PREDEF_MODEL:
|
58 |
+
PREDEF_MODEL.append(model)
|
59 |
+
MODEL_PEFT_MAP = defaultdict(lambda: [None])
|
60 |
+
for model, peft in MODEL_CONFIGS.keys():
|
61 |
+
if peft is not None:
|
62 |
+
MODEL_PEFT_MAP[model].append(peft)
|
63 |
+
|
64 |
+
|
65 |
+
def __init__(self):
    # All state is lazily populated by load_model(); None means "not loaded".
    self.model = None
    self.tokenizer = None
    self.model_name_or_path = None
    self.peft_name_or_path = None
|
70 |
+
|
71 |
+
|
72 |
+
def __del__(self):
    # Best-effort cleanup of GPU/CPU memory when the helper is garbage-collected.
    if self.model is not None:
        self.release_model()
|
75 |
+
|
76 |
+
|
77 |
+
def infer(self, diff_code, message = None, batch_size = BATCH_SIZE):
    """Classify every hunk of a commit diff as vulnerability-fixing or not.

    Returns a dict mapping file name -> list of
    {"section": hunk_text, "predict": 1/0, "conf": float}.
    Raises RuntimeError if no model has been loaded.
    """
    if self.model is None:
        raise RuntimeError("Model is not loaded")

    results = {}
    input_list = []
    # NOTE(review): MODEL_CONFIGS keys are (model, peft) tuples, but this
    # indexes with the model name string only, so the defaultdict fallback
    # always supplies the language list — confirm this is intended.
    file_diff_list = split_to_file_diff(diff_code, BaseHelper._get_lang_ext(LocalLLMHelper.MODEL_CONFIGS[self.model_name_or_path]["supported_languages"]))
    for file_a, _, file_diff in file_diff_list:
        sections = split_to_section(file_diff)
        file_name = file_a.removeprefix("a/")
        results[file_name] = []
        for section in sections:
            # Each hunk is passed as both the "patch" context and the "section".
            input_list.append(BaseHelper.InputData(file_name, section, section, message))

    input_prompt, output_text, output_prob = self.do_infer(input_list, batch_size)
    assert len(input_list) == len(input_prompt) == len(output_text) == len(output_prob)
    for i in range(len(input_list)):
        file_name = input_list[i].filename
        section = input_list[i].section
        output_text_i = output_text[i].lower()
        output_prob_i = output_prob[i]
        results[file_name].append({
            "section": section,
            # Any answer containing "yes" counts as a positive prediction.
            "predict": 1 if "yes" in output_text_i else 0,
            "conf": output_prob_i
        })

    return results
|
105 |
+
|
106 |
+
|
107 |
+
def load_model(self, model_name_or_path, peft_name_or_path = None):
    """Load a causal LM (and optional PEFT adapter), replacing any loaded model.

    No-op when the requested (model, adapter) pair is already loaded.
    """
    if model_name_or_path == self.model_name_or_path and peft_name_or_path == self.peft_name_or_path:
        return
    logger.info(f"Loading model {model_name_or_path}")

    if self.model is not None:
        self.release_model()
    # NOTE(review): loads in float32 with device_map="auto" — confirm fp32
    # (rather than fp16/bf16) is intended for inference.
    self.model = AutoModelForCausalLM.from_pretrained(model_name_or_path, trust_remote_code=True, torch_dtype=torch.float32, device_map="auto")
    # Left padding keeps generated tokens aligned at the end of each batch row.
    self.tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, padding_side="left")
    if peft_name_or_path is not None and peft_name_or_path != "" and peft_name_or_path != "None":
        logger.info(f"Loading PEFT model {peft_name_or_path}")
        self.model = PeftModel.from_pretrained(self.model, peft_name_or_path)
        # The adapter directory ships its own tokenizer (e.g. added tokens),
        # which overrides the base model's tokenizer.
        self.tokenizer = AutoTokenizer.from_pretrained(peft_name_or_path, padding_side="left")
    if self.tokenizer.pad_token_id is None:
        self.tokenizer.pad_token = self.tokenizer.eos_token

    self.model.eval()

    self.model_name_or_path = model_name_or_path
    self.peft_name_or_path = peft_name_or_path
    logger.info(f"Model loaded")
|
128 |
+
|
129 |
+
|
130 |
+
def generate_message(filename, patch, section, patch_message = None):
    """Assemble the system + user chat messages for one patch section."""
    language = BaseHelper._get_lang_by_ext(filename.split(".")[-1])
    instruction = LocalLLMHelper.generate_instruction(language, filename, patch, section, patch_message)
    return [
        {"role": "system", "content": LocalLLMHelper.SYSTEM_PROMPT},
        {"role": "user", "content": instruction},
    ]
|
144 |
+
|
145 |
+
|
146 |
+
def release_model(self):
    """Free the loaded model/tokenizer and reclaim GPU memory."""
    del self.model
    del self.tokenizer
    self.model = None
    self.tokenizer = None
    torch.cuda.empty_cache()
    logger.info(f"Model {self.model_name_or_path} released")
    self.model_name_or_path = None
    # Fix: also clear the adapter path so the helper is fully reset —
    # previously a stale peft_name_or_path survived a release, leaving the
    # helper's state inconsistent with load_model's no-op check.
    self.peft_name_or_path = None
|
154 |
+
|
155 |
+
|
156 |
+
def do_infer(self, input_list, batch_size = BATCH_SIZE):
    """Tokenize, generate, and decode answers for a list of InputData.

    Returns (input_prompt, output_text, output_prob) — three parallel lists,
    one entry per input.
    """
    if type(input_list) is not list:
        input_list = [input_list]

    input_data_batches = [input_list[i:i+batch_size] for i in range(0, len(input_list), batch_size)]
    input_ids_list = []
    if len(input_list) > 0:
        logger.info("Example input prompt")
        logger.info(LocalLLMHelper.generate_message(input_list[0].filename, input_list[0].patch, input_list[0].section, input_list[0].patch_msg))

    for batch in tqdm(input_data_batches, desc="Tokenizing", unit="batch", total=len(input_data_batches)):
        message_list = []
        for input_data in batch:
            message_list.append(LocalLLMHelper.generate_message(input_data.filename, input_data.patch, input_data.section, input_data.patch_msg))
        # Truncate to MAX_LENGTH tokens and pad (left side, per the tokenizer
        # config) so every row of the batch ends at the same position.
        input_ids_batch = self.tokenizer.apply_chat_template(
            message_list,
            add_generation_prompt=True,
            return_tensors="pt",
            max_length=LocalLLMHelper.MAX_LENGTH,
            truncation=True,
            padding=True)
        input_ids_list.append(input_ids_batch)

    input_prompt = []
    output_text = []
    output_prob = []

    for input_ids in tqdm(input_ids_list, desc="Generating", unit="batch", total=len(input_ids_list)):
        input_ids = input_ids.to(self.model.device)
        outputs = self.model.generate(input_ids, max_new_tokens=LocalLLMHelper.MAX_NEW_TOKEN,
                                      eos_token_id=self.tokenizer.eos_token_id, pad_token_id=self.tokenizer.pad_token_id,
                                      output_logits=True, return_dict_in_generate=True)

        # Decode only the newly generated tail; prompt tokens are sliced off.
        input_prompt.extend(self.tokenizer.batch_decode(input_ids, skip_special_tokens=True))
        output_text.extend(self.tokenizer.batch_decode(outputs.sequences[:, len(input_ids[0]):], skip_special_tokens=True))
        # Confidence = max softmax probability of the FIRST generated token only
        # (outputs.logits[0] is the first generation step for the whole batch) —
        # reasonable for a one-word yes/no answer.
        batch_output_prob = softmax(outputs.logits[0], dim=-1).max(dim=-1).values
        output_prob.extend([float(p) for p in batch_output_prob])

    return input_prompt, output_text, output_prob
|
195 |
+
|
196 |
+
|
197 |
+
# Module-level singleton shared by the app and the evaluation scripts.
local_llm_helper = LocalLLMHelper()
|
backend/section_infer_helper/online_llm_helper.py
ADDED
@@ -0,0 +1,137 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import logging
|
2 |
+
from openai import OpenAI
|
3 |
+
from tqdm import tqdm
|
4 |
+
from collections import defaultdict
|
5 |
+
import traceback
|
6 |
+
import httpx
|
7 |
+
|
8 |
+
from backend.utils.data_process import split_to_file_diff, split_to_section
|
9 |
+
from backend.section_infer_helper.base_helper import BaseHelper
|
10 |
+
|
11 |
+
|
12 |
+
logger = logging.getLogger(__name__)
|
13 |
+
|
14 |
+
|
15 |
+
class OnlineLLMHelper(BaseHelper):
    """Section-level patch classification via an OpenAI-compatible HTTP API."""

    # Rough budget for the user message, counted in WORDS (not tokens).
    MAX_LENGTH = 4096
    MAX_NEW_TOKENS = 16

    PREDEF_MODEL = ["gpt-3.5-turbo", "deepseek-chat", "qwen-coder-plus", "gpt-4-turbo", "gpt-4o", "gemini-1.5-pro-latest", "claude-3-5-sonnet-20241022"]

    # Every model falls back to the same default language set.
    MODEL_CONFIGS = defaultdict(lambda: {
        "supported_languages": ["C", "C++", "Java", "Python"],
    })

    SYSTEM_PROMPT = "You are an expert in code vulnerability and patch fixes."

    def generate_instruction(language, file_name, patch, section, message = None):
        # Builds the user-turn instruction text. Defined without self /
        # @staticmethod and always called through the class, mirroring
        # LocalLLMHelper's convention.
        instruction = "[TASK]\nHere is a patch in {} language and a section of this patch for a source code file with path {}. Determine if the patch section fixes any software vulnerabilities. Output 'yes' or 'no' and do not output any other text.\n".format(language, file_name)
        instruction += "[Patch]\n{}\n".format(patch)
        instruction += "[A section of this patch]\n{}\n".format(section)
        if message is not None and message != "":
            instruction += "[Message of the Patch]\n{}\n".format(message)

        return instruction

    def __init__(self):
        # Populated by load_model(); None means "not configured yet".
        self.model_name = None
        self.url = None
        self.key = None

    def generate_message(filename, patch, section, patch_message = None):
        # Assemble the system + user chat messages, word-truncating the user turn.
        ext = filename.split(".")[-1]
        language = BaseHelper._get_lang_by_ext(ext)
        user_message = OnlineLLMHelper.generate_instruction(language, filename, patch, section, patch_message)
        # Crude length control: keep at most MAX_LENGTH space-separated words.
        user_message = user_message.split(" ")
        user_message = user_message[:OnlineLLMHelper.MAX_LENGTH]
        user_message = " ".join(user_message)
        messages = [
            {
                "role": "system",
                "content": OnlineLLMHelper.SYSTEM_PROMPT
            },
            {
                "role": "user",
                "content": user_message
            }
        ]
        return messages

    def load_model(self, model_name, url, api_key):
        """Configure the OpenAI-compatible client for the given endpoint."""
        self.model_name = model_name

        self.openai_client = OpenAI(
            base_url = url,
            api_key = api_key,
            timeout=httpx.Timeout(15.0)
        )

    def infer(self, diff_code, message = None, batch_size=1):
        """Classify every hunk of a commit diff.

        Returns file name -> list of {"section", "predict", "conf"}, where
        predict is 1/0, or -1 when the API call for that section failed.
        Raises RuntimeError if load_model was never called.
        """
        if self.model_name is None:
            raise RuntimeError("Model is not loaded")

        results = {}
        input_list = []
        file_diff_list = split_to_file_diff(diff_code, BaseHelper._get_lang_ext(OnlineLLMHelper.MODEL_CONFIGS[self.model_name]["supported_languages"]))
        for file_a, _, file_diff in file_diff_list:
            sections = split_to_section(file_diff)
            file_name = file_a.removeprefix("a/")
            results[file_name] = []
            for section in sections:
                # Each hunk serves as both the "patch" context and the "section".
                input_list.append(BaseHelper.InputData(file_name, section, section, message))

        input_prompt, output_text, output_prob = self.do_infer(input_list, batch_size)
        assert len(input_list) == len(input_prompt) == len(output_text) == len(output_prob)
        for i in range(len(input_list)):
            file_name = input_list[i].filename
            section = input_list[i].section
            output_text_i = output_text[i].lower()
            output_prob_i = output_prob[i]
            results[file_name].append({
                "section": section,
                # "error" sentinel (API failure) maps to -1; otherwise yes->1, no->0.
                "predict": -1 if output_text_i == "error" else 1 if "yes" in output_text_i else 0,
                "conf": output_prob_i
            })

        return results

    def do_infer(self, input_list, batch_size = 1):
        """Query the API once per section; returns parallel prompt/text/prob lists."""
        input_prompt = []
        for input_data in input_list:
            input_prompt.append(OnlineLLMHelper.generate_message(input_data.filename, input_data.patch, input_data.section, input_data.patch_msg))

        if len(input_prompt) > 0:
            logger.info("Example input prompt: %s", input_prompt[0])
        output_text = []
        for prompt, input_data in tqdm(zip(input_prompt, input_list), desc="Inferencing", unit = "section", total = len(input_prompt)):
            try:
                response = self.openai_client.chat.completions.create(
                    messages = prompt,
                    model = self.model_name,
                    max_completion_tokens = OnlineLLMHelper.MAX_NEW_TOKENS
                )
                output_text.append(response.choices[0].message.content)
            except KeyboardInterrupt:
                # Stop early. NOTE(review): output_text is then shorter than
                # input_list, so infer()'s length assert will fail — confirm
                # this early-exit path is intended.
                logging.error("KeyboardInterrupted")
                break
            except Exception as e:
                logger.error(f"Error: {e}")
                logger.error(f"Error inferencing: {input_data.filename} - {input_data.section}")
                logger.error(traceback.format_exc())

                # Sentinel consumed by infer() as predict = -1.
                output_text.append("error")
                continue

            # break

        # The HTTP API exposes no token probabilities; report full confidence.
        output_prob = [1.0] * len(output_text)
        return input_prompt, output_text, output_prob
|
135 |
+
|
136 |
+
|
137 |
+
# Module-level singleton shared by the app and the evaluation scripts.
online_llm_helper = OnlineLLMHelper()
|
backend/section_infer_helper/random_helper.py
ADDED
@@ -0,0 +1,39 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# DEBUG ONLY
|
2 |
+
import time
|
3 |
+
import random
|
4 |
+
from tqdm import tqdm
|
5 |
+
|
6 |
+
from backend.section_infer_helper.base_helper import BaseHelper
|
7 |
+
from backend.utils.data_process import split_to_file_diff, split_to_section
|
8 |
+
|
9 |
+
class RandomHelper(BaseHelper):
    """Debug-only helper that assigns random predictions to each diff hunk."""

    PREDEF_MODEL = ["Random"]

    MODELS_SUPPORTED_LANGUAGES = {
        "Random": ["C", "C++", "Java", "Python"]
    }

    def load_model(self, model_name):
        # Nothing to load for the random baseline.
        pass

    def infer(self, diff_code):
        """Return a random prediction and confidence for every hunk of the diff."""
        preserved_exts = BaseHelper._get_lang_ext(self.MODELS_SUPPORTED_LANGUAGES["Random"])
        per_file_diffs = split_to_file_diff(diff_code, preserved_exts)
        results = {}
        for path_a, _, one_file_diff in tqdm(per_file_diffs, desc="Inferencing", unit="file", total=len(per_file_diffs)):
            time.sleep(0.1)  # simulate per-file model latency
            hunks = split_to_section(one_file_diff)
            name = path_a.removeprefix("a/")
            entries = []
            for hunk in hunks:
                entries.append({
                    "section": hunk,
                    "predict": random.choice([0, 1]),
                    "conf": random.random()
                })
            results[name] = entries
        return results
|
37 |
+
|
38 |
+
|
39 |
+
# Module-level singleton (debug baseline).
random_helper = RandomHelper()
|
backend/utils/data_process.py
ADDED
@@ -0,0 +1,131 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import os, re
|
2 |
+
|
3 |
+
|
4 |
+
# Maps each language name to the file extensions treated as that language.
LANGUAGE_EXT_MAP = {
    "C": ['c', 'h', 'cc'],
    "C++": ['cpp', 'hpp', 'cxx', 'hxx', 'c++', 'h++'],
    "Java": ['java'],
    "Python": ['py', 'pyx']
}
|
10 |
+
|
11 |
+
|
12 |
+
# Split one commit's diff into per-file diffs.
def split_to_file_diff(diff_code : str, preserve_ext_list : list = []) -> list:
    """Split a multi-file commit diff into per-file diffs.

    Returns a list of (file_a, file_b, diff_text) tuples. When
    preserve_ext_list is non-empty, only files whose BOTH sides have one of
    the given extensions (without the dot) are kept.
    The default list is never mutated, so the mutable default is safe here.
    """
    # Matches "diff --git <a-path> <b-path>" header lines.
    pattern = re.compile(r"diff --git (\S+) (\S+)")
    dotted_exts = [f".{ext}" for ext in preserve_ext_list]

    def _keep(file_a, file_b):
        # Keep when no filter is given, or both sides match a preserved extension.
        if not dotted_exts:
            return True
        return (os.path.splitext(file_a)[1] in dotted_exts
                and os.path.splitext(file_b)[1] in dotted_exts)

    files = []
    current_diff_content = []
    current_file_a, current_file_b = None, None

    for line in diff_code.splitlines():
        match = pattern.match(line)
        if match:
            # A new "diff --git" header: flush the previous file's diff first.
            if current_file_a and current_file_b and _keep(current_file_a, current_file_b):
                files.append((current_file_a, current_file_b, '\n'.join(current_diff_content)))
            current_file_a = match.group(1)
            current_file_b = match.group(2)
            current_diff_content = [line]  # restart content with the header line
        elif current_file_a and current_file_b:
            current_diff_content.append(line)

    # Flush the trailing file (the loop only flushes on a new header).
    if current_file_a and current_file_b and _keep(current_file_a, current_file_b):
        files.append((current_file_a, current_file_b, '\n'.join(current_diff_content)))

    return files
|
50 |
+
|
51 |
+
|
52 |
+
|
53 |
+
# Split one file's diff into its individual change hunks.
def split_to_section(file_diff : str) -> list:
    """Return the @@-hunk sections of a single-file diff, header included."""
    # A hunk starts at an "@@ ... @@" header and runs until the next header
    # or the end of the text.
    hunk_re = re.compile(r"@@.*?@@(\r?\n?)([\s\S]*?)(?=@@|\Z)", re.MULTILINE)
    return [m.group(0) for m in hunk_re.finditer(file_diff)]
|
68 |
+
|
69 |
+
|
70 |
+
|
71 |
+
if __name__ == "__main__":
    # Manual test: a real kernel commit diff with three hunks in one file.
    diff = \
"""diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c
index 71ba18efa15b..867664918715 100644
--- a/drivers/net/bonding/bond_main.c
+++ b/drivers/net/bonding/bond_main.c
@@ -1543,9 +1543,11 @@ int bond_enslave(struct net_device *bond_dev, struct net_device *slave_dev)
 	bond_set_carrier(bond);
 
 	if (USES_PRIMARY(bond->params.mode)) {
+		block_netpoll_tx();
 		write_lock_bh(&bond->curr_slave_lock);
 		bond_select_active_slave(bond);
 		write_unlock_bh(&bond->curr_slave_lock);
+		unblock_netpoll_tx();
 	}
 
 	pr_info("%s: enslaving %s as a%s interface with a%s link.\n",
@@ -1571,10 +1573,12 @@ err_detach:
 	if (bond->primary_slave == new_slave)
 		bond->primary_slave = NULL;
 	if (bond->curr_active_slave == new_slave) {
+		block_netpoll_tx();
 		write_lock_bh(&bond->curr_slave_lock);
 		bond_change_active_slave(bond, NULL);
 		bond_select_active_slave(bond);
 		write_unlock_bh(&bond->curr_slave_lock);
+		unblock_netpoll_tx();
 	}
 	slave_disable_netpoll(new_slave);
 
@@ -2864,9 +2868,12 @@ static int bond_slave_netdev_event(unsigned long event,
 		pr_info("%s: Primary slave changed to %s, reselecting active slave.\
",
 			bond->dev->name, bond->primary_slave ? slave_dev->name :
 			"none");
+
+		block_netpoll_tx();
 		write_lock_bh(&bond->curr_slave_lock);
 		bond_select_active_slave(bond);
 		write_unlock_bh(&bond->curr_slave_lock);
+		unblock_netpoll_tx();
 		break;
 	case NETDEV_FEAT_CHANGE:
 		bond_compute_features(bond);
"""

    # Extract every per-file change from the commit diff.
    changes = split_to_file_diff(diff, ['c'])
    for file_a, file_b, diff_content in changes:
        print(f"a: {file_a}, b: {file_b}")
        print(diff_content)
        print("=" * 50)

    change_blocks = split_to_section(changes[0][2])
    for idx, block in enumerate(change_blocks):
        print(f"Change Block {idx + 1}:")
        print(block)
        print("-" * 50)
131 |
+
|
dataset_eval.py
ADDED
@@ -0,0 +1,121 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import os
|
2 |
+
import jsonlines
|
3 |
+
import argparse
|
4 |
+
from tqdm import tqdm
|
5 |
+
import logging
|
6 |
+
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, matthews_corrcoef
|
7 |
+
|
8 |
+
from backend.section_infer_helper.local_llm_helper import local_llm_helper
|
9 |
+
from backend.section_infer_helper.online_llm_helper import online_llm_helper
|
10 |
+
|
11 |
+
|
12 |
+
# Whether to include the commit message in the prompt ("yes"/"no").
INCLUDE_MSG = "no"
BATCH_SIZE = 4

# overwrite by environment variables
INCLUDE_MSG = os.environ.get("INCLUDE_MSG", INCLUDE_MSG)
|
17 |
+
|
18 |
+
|
19 |
+
# Timestamped INFO-level logging for all modules.
logging.basicConfig(level=logging.INFO,
                    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
|
21 |
+
|
22 |
+
|
23 |
+
def main(args):
    """Run section-level inference over a JSONL test set and save predictions.

    Supports resuming: (commit_id, file_name) pairs already present in the
    output file keep their stored predictions and are not re-inferred.
    """
    if args.type == "local":
        helper = local_llm_helper
        helper.load_model(args.model, args.peft)
    elif args.type == "online":
        helper = online_llm_helper
        helper.load_model(args.model, args.url, args.key)

    labels = []
    predicts = []
    input_prompts = []
    output_text = []
    output_probs = []

    inputs = []
    with jsonlines.open(args.data, "r") as reader:
        test_data = list(reader)

    # Resume support: reload finished items and their stored results.
    # Assumes the output file rows are a prefix of test_data in the same order.
    finished_item = []
    if os.path.exists(args.output):
        with jsonlines.open(args.output, "r") as reader:
            for i, item in enumerate(reader):
                finished_item.append((item["commit_id"], item["file_name"]))
                test_data[i] = item
                for section in item["sections"]:
                    labels.append(section["related"])
                    predicts.append(section["predict"])
                    input_prompts.append(section["input_prompt"])
                    output_text.append(section["output_text"])
                    output_probs.append(section["conf"])

    # Queue every unfinished section for inference.
    for item in test_data:
        file_name = item["file_name"]
        patch = item["patch"]
        if (item["commit_id"], item["file_name"]) in finished_item:
            print(f"Skip {item['commit_id']}, {item['file_name']}")
            continue
        commit_message = item["commit_message"] if INCLUDE_MSG == "yes" else ""
        for section in item["sections"]:
            section_content = section["section"]
            inputs.append(helper.InputData(file_name, patch, section_content, commit_message))
            labels.append(section["related"])

    # NOTE: dataset-specific sanity check (the shipped test set has 4088 sections).
    assert len(labels) == 4088, f"Get {len(labels)} labels"

    try:
        this_input_prompts, this_output_text, this_output_probs = helper.do_infer(inputs, BATCH_SIZE)
    except Exception as e:
        print(f"Error: {e}")
        # Fix: previously execution fell through to undefined variables here
        # (NameError). Fall back to empty results so the already-finished
        # items are still written back out.
        this_input_prompts, this_output_text, this_output_probs = [], [], []

    input_prompts.extend(this_input_prompts)
    output_text.extend(this_output_text)
    output_probs.extend(this_output_probs)

    # Fix: derive predictions only from the NEW outputs. The old code
    # re-scanned all of output_text (including reloaded finished items),
    # duplicating their predictions and misaligning every new item's
    # prediction when resuming.
    for result in this_output_text:
        predicts.append("yes" in result.lower())

    with jsonlines.open(args.output, "w") as writer:
        for item in test_data:
            # Stop once the remaining outputs cannot cover a whole item
            # (partial runs leave the tail of test_data unwritten).
            if len(output_text) < len(item["sections"]):
                logging.info("Not enough output")
                break
            for section in item["sections"]:
                section["input_prompt"] = input_prompts.pop(0)
                section["output_text"] = output_text.pop(0)
                section["predict"] = True if predicts.pop(0) else False
                section["conf"] = output_probs.pop(0)
            writer.write(item)
|
108 |
+
|
109 |
+
|
110 |
+
if __name__ == "__main__":
    # CLI: -u/-k apply to online models, -p to local PEFT adapters.
    parser = argparse.ArgumentParser()
    parser.add_argument("-d", "--data", type=str, required=True, help="Path to the data file")
    parser.add_argument("-t", "--type", type=str, required=True, help="Type of the model", choices=["local", "online"])
    parser.add_argument("-m", "--model", type=str, required=True)
    parser.add_argument("-p", "--peft", type=str, help="Path to the PEFT file")
    parser.add_argument("-u", "--url", type=str, help="URL of the model")
    parser.add_argument("-k", "--key", type=str, help="API key")
    parser.add_argument("-o", "--output", type=str, required=True, help="Path to the output file")
    args = parser.parse_args()
    main(args)
|
121 |
+
|
evaluate/dataset/C_C++_Java_Python/test.jsonl
ADDED
The diff for this file is too large to render.
See raw diff
|
|
evaluate/result.csv
ADDED
@@ -0,0 +1,11 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
,All_Acc,All_F1,All_MCC,C/C++_Acc,C/C++_F1,C/C++_MCC,Java_Acc,Java_F1,Java_MCC,Python_Acc,Python_F1,Python_MCC
|
2 |
+
codellama-7b,84.31\%,6.14\%,-2.37\%,85.37\%,8.25\%,0.31\%,80.71\%,2.96\%,-7.70\%,84.84\%,3.49\%,-4.34\%
|
3 |
+
qwen-coder-plus,42.78\%,16.10\%,6.26\%,36.25\%,17.19\%,9.79\%,46.47\%,8.27\%,-14.31\%,60.97\%,21.08\%,22.63\%
|
4 |
+
qwen-7b,35.36\%,15.29\%,4.30\%,31.55\%,17.33\%,11.32\%,41.71\%,7.30\%,-18.04\%,41.08\%,15.37\%,14.05\%
|
5 |
+
qwen-0.5b,81.82\%,8.50\%,-1.08\%,82.40\%,9.62\%,0.30\%,79.00\%,4.29\%,-7.24\%,83.13\%,10.22\%,2.53\%
|
6 |
+
patchouli,94.58\%,56.61\%,55.71\%,93.03\%,44.27\%,42.29\%,98.29\%,89.82\%,89.25\%,95.61\%,40.74\%,49.44\%
|
7 |
+
patchouli_msg,94.77\%,57.87\%,57.22\%,93.22\%,44.81\%,43.33\%,98.35\%,90.28\%,89.64\%,95.88\%,46.43\%,53.82\%
|
8 |
+
patchouli_nomsg,94.40\%,55.36\%,54.22\%,92.83\%,43.75\%,41.33\%,98.24\%,89.36\%,88.86\%,95.34\%,34.62\%,44.66\%
|
9 |
+
dsc-7b,67.99\%,10.28\%,-2.72\%,67.32\%,10.38\%,-2.85\%,69.71\%,6.87\%,-8.29\%,68.31\%,13.48\%,6.04\%
|
10 |
+
deepseek-chat,36.46\%,16.30\%,7.38\%,37.96\%,17.10\%,9.21\%,35.35\%,19.25\%,9.78\%,32.58\%,9.90\%,-3.32\%
|
11 |
+
gpt-3.5-turbo,55.14\%,13.73\%,1.12\%,53.03\%,15.49\%,4.27\%,55.82\%,7.40\%,-12.42\%,61.59\%,14.11\%,7.54\%
|
evaluate/statistic.py
ADDED
@@ -0,0 +1,127 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import os
import jsonlines
from collections import defaultdict
import pandas as pd
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, matthews_corrcoef, confusion_matrix

RESULT_ROOTS = "./result"

# Display/grouping name for each dataset language label.
LANGUAGE_MAP = {
    "all": "All",
    "C": "C/C++",
    "C++": "C/C++",
    "Java": "Java",
    "Python": "Python",
}


def _accumulate(result_file, labels, predicts):
    """Append each section's ground truth and prediction, keyed by 'All' and by language."""
    with jsonlines.open(result_file) as reader:
        for item in reader:
            lang = LANGUAGE_MAP[item["language"]]
            for section in item["sections"]:
                for key in ("All", lang):
                    labels[key].append(section["related"])
                    predicts[key].append(section["predict"])


def _metrics(labels, predicts):
    """Compute formatted Acc/F1/MCC strings for every language group."""
    metrics = {}
    for lang in LANGUAGE_MAP.values():
        accuracy = accuracy_score(labels[lang], predicts[lang])
        f1 = f1_score(labels[lang], predicts[lang])
        mcc = matthews_corrcoef(labels[lang], predicts[lang])
        # Fix: sklearn's confusion_matrix(...).ravel() order is (tn, fp, fn, tp);
        # the previous unpacking (tp, fp, tn, fn) silently corrupted the FPR.
        tn, fp, fn, tp = confusion_matrix(labels[lang], predicts[lang]).ravel()
        fpr = fp / (fp + tn + 1e-6)  # computed but not reported; kept for reference
        metrics.update({
            f"{lang}_Acc": f"{accuracy * 100:.2f}\\%",
            f"{lang}_F1": f"{f1 * 100:.2f}\\%",
            f"{lang}_MCC": f"{mcc * 100:.2f}\\%"
        })
    return metrics


table_dict = {}

for method in os.listdir(RESULT_ROOTS):
    msg_result_file = os.path.join(RESULT_ROOTS, method, "msg.jsonl")
    nomsg_result_file = os.path.join(RESULT_ROOTS, method, "nomsg.jsonl")

    # Only score methods with BOTH result variants present.
    if not os.path.exists(msg_result_file) or not os.path.exists(nomsg_result_file):
        continue

    msg_labels, msg_predicts = defaultdict(list), defaultdict(list)
    nomsg_labels, nomsg_predicts = defaultdict(list), defaultdict(list)
    mix_labels, mix_predicts = defaultdict(list), defaultdict(list)

    # "mix" pools the with-message and without-message runs together.
    _accumulate(msg_result_file, msg_labels, msg_predicts)
    _accumulate(msg_result_file, mix_labels, mix_predicts)
    _accumulate(nomsg_result_file, nomsg_labels, nomsg_predicts)
    _accumulate(nomsg_result_file, mix_labels, mix_predicts)

    table_dict[method] = _metrics(mix_labels, mix_predicts)
    # For the main method, also report the two variants separately.
    if method == "patchouli":
        table_dict[f"{method}_msg"] = _metrics(msg_labels, msg_predicts)
        table_dict[f"{method}_nomsg"] = _metrics(nomsg_labels, nomsg_predicts)


df = pd.DataFrame(table_dict).T
df.to_csv("result.csv")
|
127 |
+
|
evaluate_local.sh
ADDED
@@ -0,0 +1,54 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
#!/usr/bin/env bash
# Evaluate local (self-hosted) models on the C/C++/Java/Python test set.
# Each model is evaluated twice: once without commit messages and once with
# them — dataset_eval.py reads the INCLUDE_MSG environment variable.

export HF_HUB_OFFLINE=1
export DATASET_ROOT="./evaluate/dataset/C_C++_Java_Python"
export RESULT_ROOT="./evaluate/result"
export EVAL_SCRIPT="./dataset_eval.py"

# run_eval MODEL OUTPUT_SUBDIR [PEFT_PATH]
# Runs the evaluation in both INCLUDE_MSG=no and INCLUDE_MSG=yes modes,
# writing results and logs under $RESULT_ROOT/OUTPUT_SUBDIR.
# PEFT_PATH is optional; when given, it is forwarded via -p.
run_eval() {
    local model="$1"
    local outdir="$RESULT_ROOT/$2"
    local peft="${3:-}"

    # Extra args only when a PEFT adapter is supplied.
    local extra=()
    if [ -n "$peft" ]; then
        extra=(-p "$peft")
    fi

    mkdir -p "$outdir"

    INCLUDE_MSG="no" python "$EVAL_SCRIPT" -d "$DATASET_ROOT/test.jsonl" -t local \
        -m "$model" "${extra[@]}" -o "$outdir/nomsg.jsonl" 2>&1 | tee "$outdir/nomsg.log"
    INCLUDE_MSG="yes" python "$EVAL_SCRIPT" -d "$DATASET_ROOT/test.jsonl" -t local \
        -m "$model" "${extra[@]}" -o "$outdir/msg.jsonl" 2>&1 | tee "$outdir/msg.log"
}

# Patchouli: Qwen2.5-Coder-0.5B base model + fine-tuned PEFT adapter.
run_eval "Qwen/Qwen2.5-Coder-0.5B-Instruct" "patchouli" "./backend/model/PEFT/patchouli-qwc2.5-0.5b"

# Baselines: base models without an adapter.
run_eval "Qwen/Qwen2.5-Coder-0.5B-Instruct" "qwen-0.5b"
run_eval "Qwen/Qwen2.5-Coder-7B-Instruct" "qwen-7b"
run_eval "deepseek-ai/deepseek-coder-7b-instruct-v1.5" "dsc-7b"
run_eval "codellama/CodeLlama-7b-Instruct-hf" "codellama-7b"
evaluate_online.sh
ADDED
@@ -0,0 +1,37 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
#!/usr/bin/env bash
# Evaluate online (API-served) models on the C/C++/Java/Python test set.
# Each model is evaluated twice: once without commit messages and once with
# them — dataset_eval.py reads the INCLUDE_MSG environment variable.
# NOTE: fill in the API keys below before running.

export HF_HUB_OFFLINE=1
export DATASET_ROOT="./evaluate/dataset/C_C++_Java_Python"
export RESULT_ROOT="./evaluate/result"
export EVAL_SCRIPT="./dataset_eval.py"

# run_eval MODEL API_URL API_KEY
# Runs the evaluation in both INCLUDE_MSG=no and INCLUDE_MSG=yes modes,
# writing results and logs under $RESULT_ROOT/MODEL.
run_eval() {
    local model="$1"
    local url="$2"
    local key="$3"
    local outdir="$RESULT_ROOT/$model"

    mkdir -p "$outdir"

    INCLUDE_MSG="no" python "$EVAL_SCRIPT" -d "$DATASET_ROOT/test.jsonl" -t online \
        -m "$model" -u "$url" -k "$key" -o "$outdir/nomsg.jsonl" 2>&1 | tee "$outdir/nomsg.log"
    INCLUDE_MSG="yes" python "$EVAL_SCRIPT" -d "$DATASET_ROOT/test.jsonl" -t online \
        -m "$model" -u "$url" -k "$key" -o "$outdir/msg.jsonl" 2>&1 | tee "$outdir/msg.log"
}

# GPT-3.5-turbo
# (previously an uncommented bare word, which bash tried to execute)
run_eval "gpt-3.5-turbo" "https://api.chatanywhere.tech/v1/" ""

# DeepSeek-V3
run_eval "deepseek-chat" "https://api.deepseek.com/v1" ""

# qwen-coder-plus
# (previously an uncommented bare word, which bash tried to execute)
run_eval "qwen-coder-plus" "https://dashscope.aliyuncs.com/compatible-mode/v1" ""
|
requirements.txt
ADDED
@@ -0,0 +1,9 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
gradio==5.12.0
|
2 |
+
jsonlines==4.0.0
|
3 |
+
openai==1.59.6
|
4 |
+
peft==0.12.0
|
5 |
+
scikit_learn==1.6.1
|
6 |
+
torch==2.5.1
|
7 |
+
tqdm==4.67.1
|
8 |
+
transformers==4.46.1
|
9 |
+
tensorboard
|