nevi1 committed on
Commit
73f4c20
1 Parent(s): e677844

Upload 244 files

This view is limited to 50 files because it contains too many changes; see the raw diff for the full set.
Files changed (50)
  1. .gitattributes +1 -0
  2. lm-watermarking-main/.gitignore +129 -0
  3. lm-watermarking-main/LICENSE.md +201 -0
  4. lm-watermarking-main/MANIFEST.in +7 -0
  5. lm-watermarking-main/README.md +119 -0
  6. lm-watermarking-main/__pycache__/alternative_prf_schemes.cpython-310.pyc +0 -0
  7. lm-watermarking-main/__pycache__/demo_watermark.cpython-310.pyc +0 -0
  8. lm-watermarking-main/__pycache__/demo_watermark.cpython-39.pyc +0 -0
  9. lm-watermarking-main/__pycache__/extended_watermark_processor.cpython-310.pyc +0 -0
  10. lm-watermarking-main/__pycache__/extended_watermark_processor.cpython-39.pyc +0 -0
  11. lm-watermarking-main/__pycache__/homoglyphs.cpython-310.pyc +0 -0
  12. lm-watermarking-main/__pycache__/normalizers.cpython-310.pyc +0 -0
  13. lm-watermarking-main/alternative_prf_schemes.py +184 -0
  14. lm-watermarking-main/app.py +52 -0
  15. lm-watermarking-main/demo_watermark.py +702 -0
  16. lm-watermarking-main/experiments/README.md +91 -0
  17. lm-watermarking-main/experiments/io_utils.py +116 -0
  18. lm-watermarking-main/experiments/launch.py +222 -0
  19. lm-watermarking-main/experiments/process_rows.py +250 -0
  20. lm-watermarking-main/experiments/run_watermarking.py +705 -0
  21. lm-watermarking-main/experiments/submitit_utils.py +79 -0
  22. lm-watermarking-main/experiments/watermark.py +820 -0
  23. lm-watermarking-main/experiments/watermarking_analysis.ipynb +2049 -0
  24. lm-watermarking-main/experiments/watermarking_example_finding.ipynb +1007 -0
  25. lm-watermarking-main/extended_watermark_processor.py +625 -0
  26. lm-watermarking-main/homoglyph_data/__init__.py +40 -0
  27. lm-watermarking-main/homoglyph_data/categories.json +0 -0
  28. lm-watermarking-main/homoglyph_data/confusables_sept2022.json +0 -0
  29. lm-watermarking-main/homoglyph_data/languages.json +34 -0
  30. lm-watermarking-main/homoglyphs.py +265 -0
  31. lm-watermarking-main/normalizers.py +208 -0
  32. lm-watermarking-main/pyproject.toml +6 -0
  33. lm-watermarking-main/requirements.txt +6 -0
  34. lm-watermarking-main/setup.cfg +68 -0
  35. lm-watermarking-main/watermark_processor.py +282 -0
  36. lm-watermarking-main/watermark_reliability_release/PIPELINE.md +154 -0
  37. lm-watermarking-main/watermark_reliability_release/README.md +27 -0
  38. lm-watermarking-main/watermark_reliability_release/alternative_prf_schemes.py +172 -0
  39. lm-watermarking-main/watermark_reliability_release/attack_pipeline.py +506 -0
  40. lm-watermarking-main/watermark_reliability_release/broadcast_token_prefixes.py +436 -0
  41. lm-watermarking-main/watermark_reliability_release/detectgpt/debug.sh +77 -0
  42. lm-watermarking-main/watermark_reliability_release/detectgpt/detectgpt_main.py +807 -0
  43. lm-watermarking-main/watermark_reliability_release/detectgpt/make_plot.py +124 -0
  44. lm-watermarking-main/watermark_reliability_release/detectgpt/plot.sh +46 -0
  45. lm-watermarking-main/watermark_reliability_release/detectgpt/run_detectgpt.sh +63 -0
  46. lm-watermarking-main/watermark_reliability_release/evaluation_pipeline.py +1330 -0
  47. lm-watermarking-main/watermark_reliability_release/figure_notebooks/baseline_comparison.ipynb +0 -0
  48. lm-watermarking-main/watermark_reliability_release/figure_notebooks/baseline_comparison_transpose.ipynb +0 -0
  49. lm-watermarking-main/watermark_reliability_release/figure_notebooks/core_robustness.ipynb +0 -0
  50. lm-watermarking-main/watermark_reliability_release/figure_notebooks/data_model.ipynb +0 -0
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ lm-watermarking-main/watermark_reliability_release/utils/data/lfqa.jsonl filter=lfs diff=lfs merge=lfs -text
lm-watermarking-main/.gitignore ADDED
@@ -0,0 +1,129 @@
1
+ # Byte-compiled / optimized / DLL files
2
+ __pycache__/
3
+ *.py[cod]
4
+ *$py.class
5
+
6
+ # C extensions
7
+ *.so
8
+
9
+ # Distribution / packaging
10
+ .Python
11
+ build/
12
+ develop-eggs/
13
+ dist/
14
+ downloads/
15
+ eggs/
16
+ .eggs/
17
+ lib/
18
+ lib64/
19
+ parts/
20
+ sdist/
21
+ var/
22
+ wheels/
23
+ pip-wheel-metadata/
24
+ share/python-wheels/
25
+ *.egg-info/
26
+ .installed.cfg
27
+ *.egg
28
+ MANIFEST
29
+
30
+ # PyInstaller
31
+ # Usually these files are written by a python script from a template
32
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
33
+ *.manifest
34
+ *.spec
35
+
36
+ # Installer logs
37
+ pip-log.txt
38
+ pip-delete-this-directory.txt
39
+
40
+ # Unit test / coverage reports
41
+ htmlcov/
42
+ .tox/
43
+ .nox/
44
+ .coverage
45
+ .coverage.*
46
+ .cache
47
+ nosetests.xml
48
+ coverage.xml
49
+ *.cover
50
+ *.py,cover
51
+ .hypothesis/
52
+ .pytest_cache/
53
+
54
+ # Translations
55
+ *.mo
56
+ *.pot
57
+
58
+ # Django stuff:
59
+ *.log
60
+ local_settings.py
61
+ db.sqlite3
62
+ db.sqlite3-journal
63
+
64
+ # Flask stuff:
65
+ instance/
66
+ .webassets-cache
67
+
68
+ # Scrapy stuff:
69
+ .scrapy
70
+
71
+ # Sphinx documentation
72
+ docs/_build/
73
+
74
+ # PyBuilder
75
+ target/
76
+
77
+ # Jupyter Notebook
78
+ .ipynb_checkpoints
79
+
80
+ # IPython
81
+ profile_default/
82
+ ipython_config.py
83
+
84
+ # pyenv
85
+ .python-version
86
+
87
+ # pipenv
88
+ # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
89
+ # However, in case of collaboration, if having platform-specific dependencies or dependencies
90
+ # having no cross-platform support, pipenv may install dependencies that don't work, or not
91
+ # install all needed dependencies.
92
+ #Pipfile.lock
93
+
94
+ # PEP 582; used by e.g. github.com/David-OConnor/pyflow
95
+ __pypackages__/
96
+
97
+ # Celery stuff
98
+ celerybeat-schedule
99
+ celerybeat.pid
100
+
101
+ # SageMath parsed files
102
+ *.sage.py
103
+
104
+ # Environments
105
+ .env
106
+ .venv
107
+ env/
108
+ venv/
109
+ ENV/
110
+ env.bak/
111
+ venv.bak/
112
+
113
+ # Spyder project settings
114
+ .spyderproject
115
+ .spyproject
116
+
117
+ # Rope project settings
118
+ .ropeproject
119
+
120
+ # mkdocs documentation
121
+ /site
122
+
123
+ # mypy
124
+ .mypy_cache/
125
+ .dmypy.json
126
+ dmypy.json
127
+
128
+ # Pyre type checker
129
+ .pyre/
lm-watermarking-main/LICENSE.md ADDED
@@ -0,0 +1,201 @@
1
+ Apache License
2
+ Version 2.0, January 2004
3
+ http://www.apache.org/licenses/
4
+
5
+ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
6
+
7
+ 1. Definitions.
8
+
9
+ "License" shall mean the terms and conditions for use, reproduction,
10
+ and distribution as defined by Sections 1 through 9 of this document.
11
+
12
+ "Licensor" shall mean the copyright owner or entity authorized by
13
+ the copyright owner that is granting the License.
14
+
15
+ "Legal Entity" shall mean the union of the acting entity and all
16
+ other entities that control, are controlled by, or are under common
17
+ control with that entity. For the purposes of this definition,
18
+ "control" means (i) the power, direct or indirect, to cause the
19
+ direction or management of such entity, whether by contract or
20
+ otherwise, or (ii) ownership of fifty percent (50%) or more of the
21
+ outstanding shares, or (iii) beneficial ownership of such entity.
22
+
23
+ "You" (or "Your") shall mean an individual or Legal Entity
24
+ exercising permissions granted by this License.
25
+
26
+ "Source" form shall mean the preferred form for making modifications,
27
+ including but not limited to software source code, documentation
28
+ source, and configuration files.
29
+
30
+ "Object" form shall mean any form resulting from mechanical
31
+ transformation or translation of a Source form, including but
32
+ not limited to compiled object code, generated documentation,
33
+ and conversions to other media types.
34
+
35
+ "Work" shall mean the work of authorship, whether in Source or
36
+ Object form, made available under the License, as indicated by a
37
+ copyright notice that is included in or attached to the work
38
+ (an example is provided in the Appendix below).
39
+
40
+ "Derivative Works" shall mean any work, whether in Source or Object
41
+ form, that is based on (or derived from) the Work and for which the
42
+ editorial revisions, annotations, elaborations, or other modifications
43
+ represent, as a whole, an original work of authorship. For the purposes
44
+ of this License, Derivative Works shall not include works that remain
45
+ separable from, or merely link (or bind by name) to the interfaces of,
46
+ the Work and Derivative Works thereof.
47
+
48
+ "Contribution" shall mean any work of authorship, including
49
+ the original version of the Work and any modifications or additions
50
+ to that Work or Derivative Works thereof, that is intentionally
51
+ submitted to Licensor for inclusion in the Work by the copyright owner
52
+ or by an individual or Legal Entity authorized to submit on behalf of
53
+ the copyright owner. For the purposes of this definition, "submitted"
54
+ means any form of electronic, verbal, or written communication sent
55
+ to the Licensor or its representatives, including but not limited to
56
+ communication on electronic mailing lists, source code control systems,
57
+ and issue tracking systems that are managed by, or on behalf of, the
58
+ Licensor for the purpose of discussing and improving the Work, but
59
+ excluding communication that is conspicuously marked or otherwise
60
+ designated in writing by the copyright owner as "Not a Contribution."
61
+
62
+ "Contributor" shall mean Licensor and any individual or Legal Entity
63
+ on behalf of whom a Contribution has been received by Licensor and
64
+ subsequently incorporated within the Work.
65
+
66
+ 2. Grant of Copyright License. Subject to the terms and conditions of
67
+ this License, each Contributor hereby grants to You a perpetual,
68
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
69
+ copyright license to reproduce, prepare Derivative Works of,
70
+ publicly display, publicly perform, sublicense, and distribute the
71
+ Work and such Derivative Works in Source or Object form.
72
+
73
+ 3. Grant of Patent License. Subject to the terms and conditions of
74
+ this License, each Contributor hereby grants to You a perpetual,
75
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
76
+ (except as stated in this section) patent license to make, have made,
77
+ use, offer to sell, sell, import, and otherwise transfer the Work,
78
+ where such license applies only to those patent claims licensable
79
+ by such Contributor that are necessarily infringed by their
80
+ Contribution(s) alone or by combination of their Contribution(s)
81
+ with the Work to which such Contribution(s) was submitted. If You
82
+ institute patent litigation against any entity (including a
83
+ cross-claim or counterclaim in a lawsuit) alleging that the Work
84
+ or a Contribution incorporated within the Work constitutes direct
85
+ or contributory patent infringement, then any patent licenses
86
+ granted to You under this License for that Work shall terminate
87
+ as of the date such litigation is filed.
88
+
89
+ 4. Redistribution. You may reproduce and distribute copies of the
90
+ Work or Derivative Works thereof in any medium, with or without
91
+ modifications, and in Source or Object form, provided that You
92
+ meet the following conditions:
93
+
94
+ (a) You must give any other recipients of the Work or
95
+ Derivative Works a copy of this License; and
96
+
97
+ (b) You must cause any modified files to carry prominent notices
98
+ stating that You changed the files; and
99
+
100
+ (c) You must retain, in the Source form of any Derivative Works
101
+ that You distribute, all copyright, patent, trademark, and
102
+ attribution notices from the Source form of the Work,
103
+ excluding those notices that do not pertain to any part of
104
+ the Derivative Works; and
105
+
106
+ (d) If the Work includes a "NOTICE" text file as part of its
107
+ distribution, then any Derivative Works that You distribute must
108
+ include a readable copy of the attribution notices contained
109
+ within such NOTICE file, excluding those notices that do not
110
+ pertain to any part of the Derivative Works, in at least one
111
+ of the following places: within a NOTICE text file distributed
112
+ as part of the Derivative Works; within the Source form or
113
+ documentation, if provided along with the Derivative Works; or,
114
+ within a display generated by the Derivative Works, if and
115
+ wherever such third-party notices normally appear. The contents
116
+ of the NOTICE file are for informational purposes only and
117
+ do not modify the License. You may add Your own attribution
118
+ notices within Derivative Works that You distribute, alongside
119
+ or as an addendum to the NOTICE text from the Work, provided
120
+ that such additional attribution notices cannot be construed
121
+ as modifying the License.
122
+
123
+ You may add Your own copyright statement to Your modifications and
124
+ may provide additional or different license terms and conditions
125
+ for use, reproduction, or distribution of Your modifications, or
126
+ for any such Derivative Works as a whole, provided Your use,
127
+ reproduction, and distribution of the Work otherwise complies with
128
+ the conditions stated in this License.
129
+
130
+ 5. Submission of Contributions. Unless You explicitly state otherwise,
131
+ any Contribution intentionally submitted for inclusion in the Work
132
+ by You to the Licensor shall be under the terms and conditions of
133
+ this License, without any additional terms or conditions.
134
+ Notwithstanding the above, nothing herein shall supersede or modify
135
+ the terms of any separate license agreement you may have executed
136
+ with Licensor regarding such Contributions.
137
+
138
+ 6. Trademarks. This License does not grant permission to use the trade
139
+ names, trademarks, service marks, or product names of the Licensor,
140
+ except as required for reasonable and customary use in describing the
141
+ origin of the Work and reproducing the content of the NOTICE file.
142
+
143
+ 7. Disclaimer of Warranty. Unless required by applicable law or
144
+ agreed to in writing, Licensor provides the Work (and each
145
+ Contributor provides its Contributions) on an "AS IS" BASIS,
146
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147
+ implied, including, without limitation, any warranties or conditions
148
+ of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149
+ PARTICULAR PURPOSE. You are solely responsible for determining the
150
+ appropriateness of using or redistributing the Work and assume any
151
+ risks associated with Your exercise of permissions under this License.
152
+
153
+ 8. Limitation of Liability. In no event and under no legal theory,
154
+ whether in tort (including negligence), contract, or otherwise,
155
+ unless required by applicable law (such as deliberate and grossly
156
+ negligent acts) or agreed to in writing, shall any Contributor be
157
+ liable to You for damages, including any direct, indirect, special,
158
+ incidental, or consequential damages of any character arising as a
159
+ result of this License or out of the use or inability to use the
160
+ Work (including but not limited to damages for loss of goodwill,
161
+ work stoppage, computer failure or malfunction, or any and all
162
+ other commercial damages or losses), even if such Contributor
163
+ has been advised of the possibility of such damages.
164
+
165
+ 9. Accepting Warranty or Additional Liability. While redistributing
166
+ the Work or Derivative Works thereof, You may choose to offer,
167
+ and charge a fee for, acceptance of support, warranty, indemnity,
168
+ or other liability obligations and/or rights consistent with this
169
+ License. However, in accepting such obligations, You may act only
170
+ on Your own behalf and on Your sole responsibility, not on behalf
171
+ of any other Contributor, and only if You agree to indemnify,
172
+ defend, and hold each Contributor harmless for any liability
173
+ incurred by, or claims asserted against, such Contributor by reason
174
+ of your accepting any such warranty or additional liability.
175
+
176
+ END OF TERMS AND CONDITIONS
177
+
178
+ APPENDIX: How to apply the Apache License to your work.
179
+
180
+ To apply the Apache License to your work, attach the following
181
+ boilerplate notice, with the fields enclosed by brackets "[]"
182
+ replaced with your own identifying information. (Don't include
183
+ the brackets!) The text should be enclosed in the appropriate
184
+ comment syntax for the file format. We also recommend that a
185
+ file or class name and description of purpose be included on the
186
+ same "printed page" as the copyright notice for easier
187
+ identification within third-party archives.
188
+
189
+ Copyright [2023] [Authors of 'A Watermark for Large Language Models']
190
+
191
+ Licensed under the Apache License, Version 2.0 (the "License");
192
+ you may not use this file except in compliance with the License.
193
+ You may obtain a copy of the License at
194
+
195
+ http://www.apache.org/licenses/LICENSE-2.0
196
+
197
+ Unless required by applicable law or agreed to in writing, software
198
+ distributed under the License is distributed on an "AS IS" BASIS,
199
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200
+ See the License for the specific language governing permissions and
201
+ limitations under the License.
lm-watermarking-main/MANIFEST.in ADDED
@@ -0,0 +1,7 @@
1
+ # added manually
2
+ include *.py
3
+ include *.json
4
+ include *.md
5
+
6
+ global-exclude *.pyc
7
+ global-exclude __pycache__
lm-watermarking-main/README.md ADDED
@@ -0,0 +1,119 @@
1
+ # 💧 [A Watermark for Large Language Models](https://arxiv.org/abs/2301.10226) 🔍
2
+
3
+ ### [Demo](https://huggingface.co/spaces/tomg-group-umd/lm-watermarking) | [Paper](https://arxiv.org/abs/2301.10226)
4
+
5
+ Official implementation of the watermarking and detection algorithms presented in the paper:
6
+
7
+ "A Watermark for Large Language Models" by _John Kirchenbauer*, Jonas Geiping*, Yuxin Wen, Jonathan Katz, Ian Miers, Tom Goldstein_
8
+
9
+ ### Updates:
10
+
11
+ - **(6/7/23)** We're thrilled to announce the release of ["On the Reliability of Watermarks for Large Language Models"](https://arxiv.org/abs/2306.04634). Our new preprint documents a deep dive into the robustness properties of more advanced watermarks.
12
+
13
+ - **(6/9/23)** Initial code release implementing the alternate watermark and detector variants in the new work. Files located in this subdirectory: [`watermark_reliability_release`](watermark_reliability_release).
14
+
15
+ - **(9/23/23)** Update to the docs with recommendations on parameter settings. Extended implementation (recommended) available in `extended_watermark_processor.py`.
16
+
17
+ ---
18
+
19
+ Implementation is based on the "logit processor" abstraction provided by the [huggingface/transformers 🤗](https://github.com/huggingface/transformers) library.
20
+
21
+ The `WatermarkLogitsProcessor` is designed to be readily compatible with any model that supports the `generate` API.
22
+ Any model that can be constructed using the `AutoModelForCausalLM` or `AutoModelForSeq2SeqLM` factories _should_ be compatible.
23
+
24
+ ### Repo contents
25
+
26
+ The core implementation is defined by the `WatermarkBase`, `WatermarkLogitsProcessor`, and `WatermarkDetector` classes in the files `watermark_processor.py` (a minimal implementation) and `extended_watermark_processor.py` (the more full-featured implementation, recommended).
27
+ The `demo_watermark.py` script implements a gradio demo interface as well as a minimum working example in the `main` function using the minimal version.
28
+
29
+ Details about the parameters and the detection outputs are provided in the gradio app markdown blocks as well as the argparse definition.
30
+
31
+ The `homoglyphs.py` and `normalizers.py` modules implement algorithms used by the `WatermarkDetector`. `homoglyphs.py` (and its raw data in `homoglyph_data`) is an updated version of the homoglyph code from the deprecated package described here: https://github.com/life4/homoglyphs.
32
+ The `experiments` directory contains pipeline code that we used to run the original experiments in the paper. However, this is stale/deprecated
33
+ in favor of the implementation in `watermark_processor.py`.
34
+
35
+ ### Demo Usage
36
+
37
+ As a quickstart, the app can be launched with default args (or deployed to a [huggingface Space](https://huggingface.co/spaces)) using `app.py`
38
+ which is just a thin wrapper around the demo script.
39
+ ```sh
40
+ python app.py
41
+ gradio app.py # for hot reloading
42
+ # or
43
+ python demo_watermark.py --model_name_or_path facebook/opt-6.7b
44
+ ```
45
+
46
+
47
+ ### How to Watermark - A short guide on watermark hyperparameters
48
+ What watermark hyperparameters are optimal for your task or for a comparison to new watermarks? We provide a brief overview of all important settings below, along with best practices for future work. This guide represents our current understanding of optimal settings as of August 2023, and so is a bit more up to date than our ICML 2023 conference paper.
49
+
50
+ **TL;DR**: As a baseline generation setting, we suggest default values of `gamma=0.25` and `delta=2.0`. Reduce delta if text quality is negatively impacted. For the context width, h, we recommend a moderate value, i.e. h=4, and as a default PRF we recommend `selfhash`, but you can use `minhash` if you want. Reduce h if more robustness against edits is required. Note, however, that the choice of PRF only matters if h>1. The recommended PRF and context width can be easily selected by instantiating the watermark processor and detector with `seeding_scheme="selfhash"` (a shorthand for `seeding_scheme="ff-anchored_minhash_prf-4-True-15485863"`, but do use a different base key if actually deploying). For detection, always run with `--ignore-repeated-ngrams=True`.
51
+
52
+ 1) **Logit bias delta**: The magnitude of delta determines the strength of the watermark. A sufficiently large value of delta recovers a "hard" watermark that encodes 1 bit of information at every token, but this is not an advisable setting, as it strongly affects model quality. A moderate delta in the range of [0.5, 2.0] is appropriate for normal use cases, but the strength of delta is relative to the entropy of the output distribution. Models that are overconfident, such as instruction-tuned models, may benefit from choosing a larger delta value. With non-infinite delta values, the watermark strength is directly proportional to the (spike) entropy of the text and exp(delta) (see Theorem 4.2 in our paper).
53
+
54
+ 2) **Context width h**: Context width is the length of the context which is taken into account when seeding the watermark at each location. The longer the context, the "more random" the red/green list partitions are, and the less detectable the watermark is. For private watermarks, this implies that the watermark is harder to discover via brute-force (with an exponential increase in hardness with increasing context width h).
55
+ In the limit of a very long context width, we approach the "undetectable" setting of https://eprint.iacr.org/2023/763. However, the longer the context width, the less "nuclear" the watermark is, and robustness to paraphrasing and other attacks decreases. In the limit of h=0, the watermark is independent of local context and, as such, it is minimally random, but maximally robust against edits (see https://arxiv.org/abs/2306.17439).
56
+
57
+ 3) **Ignoring repeated ngrams**: The watermark is only pseudo-random based on the local context. Whenever local context repeats, this constitutes a violation of the assumption that the PRNG numbers used to seed the green/red partition operation are drawn iid. (See Sec.4. in our paper for details). For this reason, p-values for text with repeated n-grams (n-gram here meaning context + chosen token) will be misleading. As such, detection should be run with `--ignore-repeated-ngrams` set to `True`. An additional, detailed analysis of this effect can be found in http://arxiv.org/abs/2308.00113.
58
+
59
+ 4) **Choice of pseudo-random-function** (PRF): This choice is only relevant if context width h>1 and determines the robustness of the hash of the context against edits. In our experiments we find "min"-hash PRFs to be the most performant in striking a balance between maximizing robustness and minimizing impact on text quality. In comparison to a PRF that depends on the entire context, this PRF only depends on a single, randomly chosen token from the context.
60
+
61
+ 5) **Self-Hashing**: It is possible to extend the context width of the watermark onto the current token. This effectively extends the context width "for-free" by one. The only downside is that this approach requires hashing all possible next tokens, and applying the logit bias only to tokens whose inclusion in the context would produce a hash that places them on their own green list. This is slow in the way we implement it, because we use CUDA's pseudorandom number generator and a simple inner-loop implementation, but in principle it has a negligible cost compared to generating new tokens, if engineered for deployment. A generalized algorithm for self-hashing can be found as Alg.1 in http://arxiv.org/abs/2306.04634.
62
+
63
+ 6) **Gamma**: Gamma denotes the fraction of the vocabulary that will be in each green list. We find gamma=0.25 to be slightly better empirically, but this is a minor effect and reasonable values of gamma between 0.25 and 0.75 will lead to a reasonable watermark. An intuitive argument can be made for why a lower gamma value makes it easier to achieve a fraction of green tokens sufficiently higher than gamma to reject the null hypothesis.
64
+
65
+ 7) **Base Key**: Our watermark is salted with a small base key of 15485863 (the millionth prime). If you deploy this watermark, we do not advise re-using this key.
66
+
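A minimal sketch of following the base-key advice above, using the freeform `ff-...` scheme string described in the TL;DR. The `tokenizer` is assumed to exist already (as in the snippets below), and the key value here is purely hypothetical:

```python
from extended_watermark_processor import WatermarkLogitsProcessor

MY_HASH_KEY = 32452843  # hypothetical private prime; do not reuse the public default 15485863

watermark_processor = WatermarkLogitsProcessor(
    vocab=list(tokenizer.get_vocab().values()),
    gamma=0.25,
    delta=2.0,
    # same PRF and context width as the "selfhash" shorthand, but salted with a private key
    seeding_scheme=f"ff-anchored_minhash_prf-4-True-{MY_HASH_KEY}",
)
```

The detector must be constructed with the identical `seeding_scheme` string, otherwise detection will fail.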
67
+ ### How to use the watermark in your own code.
68
+
69
+ Our implementation can be added into any huggingface generation pipeline as an additional `LogitsProcessor`; only the classes `WatermarkLogitsProcessor` and `WatermarkDetector` from the `extended_watermark_processor.py` file are required.
70
+
71
+ Example snippet to generate watermarked text:
72
+ ```python
73
+
74
+ from transformers import LogitsProcessorList
+ from extended_watermark_processor import WatermarkLogitsProcessor
75
+
76
+ watermark_processor = WatermarkLogitsProcessor(vocab=list(tokenizer.get_vocab().values()),
77
+ gamma=0.25,
78
+ delta=2.0,
79
+ seeding_scheme="selfhash") #equivalent to `ff-anchored_minhash_prf-4-True-15485863`
80
+ # Note:
81
+ # You can turn off self-hashing by setting the seeding scheme to `minhash`.
82
+
83
+ tokenized_input = tokenizer(input_text, return_tensors="pt").to(model.device)
84
+ # note that if the model is on cuda, then the input is on cuda
85
+ # and thus the watermarking rng is cuda-based.
86
+ # This is a different generator than the cpu-based rng in pytorch!
87
+
88
+ output_tokens = model.generate(**tokenized_input,
89
+ logits_processor=LogitsProcessorList([watermark_processor]))
90
+
91
+ # if decoder only model, then we need to isolate the
92
+ # newly generated tokens as only those are watermarked, the input/prompt is not
93
+ output_tokens = output_tokens[:,tokenized_input["input_ids"].shape[-1]:]
94
+
95
+ output_text = tokenizer.batch_decode(output_tokens, skip_special_tokens=True)[0]
96
+ ```
97
+
98
+ Example snippet to detect watermarked text:
99
+ ```python
100
+
101
+ from extended_watermark_processor import WatermarkDetector
102
+
103
+ watermark_detector = WatermarkDetector(vocab=list(tokenizer.get_vocab().values()),
104
+ gamma=0.25, # should match original setting
105
+ seeding_scheme="selfhash", # should match original setting
106
+ device=model.device, # must match the original rng device type
107
+ tokenizer=tokenizer,
108
+ z_threshold=4.0,
109
+ normalizers=[],
110
+ ignore_repeated_ngrams=True)
111
+
112
+ score_dict = watermark_detector.detect(output_text) # or any other text of interest to analyze
113
+ ```
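The returned `score_dict` is a plain dictionary; a minimal sketch of reading it (key names are taken from the demo script's formatting helper, so treat them as indicative rather than a stable API):

```python
print(f"z-score:    {score_dict['z_score']:.2f}")
print(f"p-value:    {score_dict['p_value']:.2e}")
print(f"prediction: {score_dict['prediction']}")  # True => flagged as watermarked

# For intuition: with T scored tokens and green-list fraction gamma, the z-statistic
# is a one-proportion z-test on the green-token count,
# roughly z = (num_green - gamma * T) / sqrt(T * gamma * (1 - gamma)).
```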
114
+
115
+ To recover the main settings of the experiments in the original work (for historical reasons), use the seeding scheme `simple_1` and set `ignore_repeated_ngrams=False` at detection time.
116
+
117
+
118
+ ### Contributing
119
+ Suggestions and PR's welcome 🙂
lm-watermarking-main/__pycache__/alternative_prf_schemes.cpython-310.pyc ADDED
Binary file (4.92 kB).
 
lm-watermarking-main/__pycache__/demo_watermark.cpython-310.pyc ADDED
Binary file (28.4 kB).
 
lm-watermarking-main/__pycache__/demo_watermark.cpython-39.pyc ADDED
Binary file (28.5 kB).
 
lm-watermarking-main/__pycache__/extended_watermark_processor.cpython-310.pyc ADDED
Binary file (18.3 kB).
 
lm-watermarking-main/__pycache__/extended_watermark_processor.cpython-39.pyc ADDED
Binary file (17.8 kB).
 
lm-watermarking-main/__pycache__/homoglyphs.cpython-310.pyc ADDED
Binary file (7.79 kB).
 
lm-watermarking-main/__pycache__/normalizers.cpython-310.pyc ADDED
Binary file (6.86 kB).
 
lm-watermarking-main/alternative_prf_schemes.py ADDED
@@ -0,0 +1,184 @@
1
+ """Implement other PRF functions (these vary only in how they generate a single hash from the tokens in the context).
2
+
3
+ Can be hooked into existing WatermarkLogitsProcessor as modified base class WatermarkBase, see implementation in
4
+ extended_watermark_processor.py
5
+ """
6
+
7
+ # coding=utf-8
8
+ # Copyright 2023 Authors of "A Watermark for Large Language Models"
9
+ # available at https://arxiv.org/abs/2301.10226
10
+ #
11
+ # Licensed under the Apache License, Version 2.0 (the "License");
12
+ # you may not use this file except in compliance with the License.
13
+ # You may obtain a copy of the License at
14
+ #
15
+ # http://www.apache.org/licenses/LICENSE-2.0
16
+ #
17
+ # Unless required by applicable law or agreed to in writing, software
18
+ # distributed under the License is distributed on an "AS IS" BASIS,
19
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
20
+ # See the License for the specific language governing permissions and
21
+ # limitations under the License.
22
+
23
+ import torch
24
+ from itertools import combinations
25
+ from functools import cache
26
+
27
+ # Key properties of a hashing scheme
28
+ props = {
29
+ "prf_type": str, # string name of the underlying PRF mapping multiple token ids to a random seed
30
+ "context_width": int, # this is h in the paper, how many previous tokens should be considered for each PRF
31
+ "self_salt": bool, # Use the rules laid out in robust-watermarking to use the token itself to seed and possibly reject its own list
32
+ "hash_key": int, # integer, large prime, used to move the seed away from low-entropy bit sequences in the PRF chosen above
33
+ }
34
+
35
+
36
+ def seeding_scheme_lookup(seeding_scheme: str):
37
+ if not isinstance(seeding_scheme, str):
38
+ raise ValueError("Seeding scheme should be a string summarizing the procedure.")
39
+ if seeding_scheme == "simple_1" or seeding_scheme == "lefthash":
40
+ # Default, simple bigram hash # alias for ff-additive_prf-1-False-15485863
41
+ prf_type = "additive_prf"
42
+ context_width = 1
43
+ self_salt = False
44
+ hash_key = 15485863
45
+ elif seeding_scheme == "algorithm-3" or seeding_scheme == "selfhash":
46
+ prf_type = "anchored_minhash_prf"
47
+ context_width = 4
48
+ self_salt = True
49
+ hash_key = 15485863
50
+ elif seeding_scheme == "minhash":
51
+ prf_type = "minhash_prf"
52
+ context_width = 4
53
+ self_salt = False
54
+ hash_key = 15485863
55
+ elif seeding_scheme == "skipgram":
56
+ prf_type = "skipgram_prf"
57
+ context_width = 5
58
+ self_salt = False
59
+ hash_key = 15485863
60
+ elif seeding_scheme.startswith("ff"): # freeform seeding scheme API - only use for experimenting
61
+ # expects strings of the form ff-additive_prf-4-True-hash or ff-additive_prf-5-True (hash key is optional)
62
+ split_scheme = seeding_scheme.split("-")
63
+ prf_type = str(split_scheme[1])
64
+ context_width = int(split_scheme[2])
65
+ self_salt = split_scheme[3] == "True"
66
+ if len(split_scheme) == 5:
67
+ hash_key = int(split_scheme[4])
68
+ else:
69
+ hash_key = 15485863
70
+ else:
71
+ raise ValueError(f"Invalid seeding scheme name {seeding_scheme} given. Try 'simple_1'?")
72
+
73
+ assert prf_type in prf_lookup.keys()
74
+ return prf_type, context_width, self_salt, hash_key
75
+
76
+
77
+ def multiplicative_prf(input_ids: torch.LongTensor, salt_key: int) -> int:
78
+ return salt_key * input_ids.prod().item()
79
+
80
+
81
+ def additive_prf(input_ids: torch.LongTensor, salt_key: int) -> int:
82
+ return salt_key * input_ids.sum().item()
83
+
84
+
85
+ def minfunc_prf(input_ids: torch.LongTensor, salt_key: int) -> int:
86
+ # not a great idea for non-random input ids as in text
87
+ return salt_key * input_ids.min().item()
88
+
89
+
90
+ def simple_skip_prf(input_ids: torch.LongTensor, salt_key: int, k=2) -> int:
91
+ # k is the skip distance
92
+ return hashint(salt_key * input_ids[::k]).prod().item()
93
+
94
+
95
+ def skipgram_prf(input_ids: torch.LongTensor, salt_key: int) -> int:
96
+ # maximum distance skipgram within context
97
+ return hashint(salt_key * input_ids[0]).item()
98
+
99
+
100
+ def anchored_skipgram_prf(input_ids: torch.LongTensor, salt_key: int, anchor: int = -1) -> int:
101
+ # maximum distance skipgram within context
102
+ return (hashint(salt_key * input_ids[0]) * hashint(salt_key * input_ids[anchor])).item()
103
+
104
+
105
+ def minhash_prf(input_ids: torch.LongTensor, salt_key: int) -> int:
106
+ # slightly less not the greatest idea for non-random input ids as in text
107
+ return hashint(salt_key * input_ids).min().item()
108
+
109
+
110
+ def anchored_minhash_prf(input_ids: torch.LongTensor, salt_key: int, anchor: int = -1) -> int:
111
+ # Anchor to one key to produce a min over pairs again
112
+ return (salt_key * hashint(input_ids) * hashint(input_ids[anchor])).min().item()
113
+
114
+
115
+ def minskipgram_prf(input_ids: torch.LongTensor, salt_key: int, k: int = 2) -> int:
116
+ # min over all skipgrams in context, k=2 is all pairs
117
+ skipgrams = torch.as_tensor(list(combinations(hashint(salt_key * input_ids), 2)))
118
+ return skipgrams.prod(dim=1).min().item()
119
+
120
+
121
+ def noncomm_prf(input_ids: torch.LongTensor, salt_key: int, k: int = 2) -> int:
122
+ key = torch.as_tensor(salt_key, dtype=torch.long)
123
+ for entry in input_ids:
124
+ key *= hashint(key * entry)
125
+ key %= 2**32
126
+ return key.item()
127
+
128
+
129
+ def position_prf(input_ids: torch.LongTensor, salt_key: int, k: int = 2) -> int:
130
+ return (salt_key * input_ids * torch.arange(1, len(input_ids) + 1, device=input_ids.device)).sum().item()
131
+
132
+
133
+ prf_lookup = {
134
+ "multiplicative_prf": multiplicative_prf,
135
+ "additive_prf": additive_prf,
136
+ "minfunc_prf": minfunc_prf,
137
+ "simple_skip_prf": simple_skip_prf,
138
+ "skipgram_prf": skipgram_prf,
139
+ "anchored_skipgram_prf": anchored_skipgram_prf,
140
+ "minhash_prf": minhash_prf,
141
+ "anchored_minhash_prf": anchored_minhash_prf,
142
+ "minskipgram_prf": minskipgram_prf,
143
+ "noncomm_prf": noncomm_prf,
144
+ "position_prf": position_prf,
145
+ }
146
+
147
+ # Generate a global permute table once at startup
148
+ rng = torch.Generator(device=torch.device("cpu"))
149
+ rng.manual_seed(2971215073) # fib47 is prime
150
+ table_size = 1_000_003
151
+ fixed_table = torch.randperm(1_000_003, device=torch.device("cpu"), generator=rng) # actually faster than I thought
152
+
153
+
154
+ def hashint(integer_tensor: torch.LongTensor) -> torch.LongTensor:
155
+ """Sane version, in the end we only need a small permutation table."""
156
+ return fixed_table[integer_tensor.cpu() % table_size] + 1 # minor cheat here, this function always return CPU values
157
+
158
+
159
+ def _hashint_avalanche_tensor(integer_tensor: torch.LongTensor):
160
+ """http://burtleburtle.net/bob/hash/integer.html, ported into pytorch, runs on tensors. Apparently a decent avalanche."""
161
+ i = integer_tensor.to(torch.int32).clone() # or torch.int16?
162
+ i -= i << 6
163
+ i ^= i >> 17
164
+ i -= i << 9
165
+ i ^= i << 4
166
+ i -= i << 3
167
+ i ^= i << 10
168
+ i ^= i >> 15
169
+ return i.to(torch.long)
170
+
171
+
172
+ @cache
173
+ def _hashint_avalanche_int(integer: int):
174
+ """http://burtleburtle.net/bob/hash/integer.html, runs in base python, caches based on access.
175
+ Does this make sense for signed 64bit ints?"""
176
+ i = integer % (2**32)
177
+ i -= i << 6
178
+ i ^= i >> 17
179
+ i -= i << 9
180
+ i ^= i << 4
181
+ i -= i << 3
182
+ i ^= i << 10
183
+ i ^= i >> 15
184
+ return i
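For reference, a small sketch of how the scheme names above resolve (a usage example, assuming the module is importable from the working directory):

```python
from alternative_prf_schemes import seeding_scheme_lookup

# Each alias resolves to (prf_type, context_width, self_salt, hash_key):
print(seeding_scheme_lookup("selfhash"))  # ('anchored_minhash_prf', 4, True, 15485863)
print(seeding_scheme_lookup("lefthash"))  # ('additive_prf', 1, False, 15485863)
# Freeform strings let you experiment with any registered PRF:
print(seeding_scheme_lookup("ff-skipgram_prf-5-False-15485863"))
```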
lm-watermarking-main/app.py ADDED
@@ -0,0 +1,52 @@
1
+ # coding=utf-8
2
+ # Copyright 2023 Authors of "A Watermark for Large Language Models"
3
+ # available at https://arxiv.org/abs/2301.10226
4
+ #
5
+ # Licensed under the Apache License, Version 2.0 (the "License");
6
+ # you may not use this file except in compliance with the License.
7
+ # You may obtain a copy of the License at
8
+ #
9
+ # http://www.apache.org/licenses/LICENSE-2.0
10
+ #
11
+ # Unless required by applicable law or agreed to in writing, software
12
+ # distributed under the License is distributed on an "AS IS" BASIS,
13
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14
+ # See the License for the specific language governing permissions and
15
+ # limitations under the License.
16
+
17
+ from argparse import Namespace
18
+ args = Namespace()
19
+
20
+ arg_dict = {
21
+ 'run_gradio': True,
22
+ 'demo_public': False,
23
+ # 'model_name_or_path': 'facebook/opt-125m',
24
+ # 'model_name_or_path': 'facebook/opt-1.3b',
25
+ # 'model_name_or_path': 'facebook/opt-2.7b',
26
+ 'model_name_or_path': 'facebook/opt-6.7b',
27
+ # 'model_name_or_path': 'facebook/opt-13b',
28
+ # 'load_fp16' : True,
29
+ 'load_fp16' : False,
30
+ 'prompt_max_length': None,
31
+ 'max_new_tokens': 200,
32
+ 'generation_seed': 123,
33
+ 'use_sampling': True,
34
+ 'n_beams': 1,
35
+ 'sampling_temp': 0.7,
36
+ 'use_gpu': True,
37
+ 'seeding_scheme': 'simple_1',
38
+ 'gamma': 0.25,
39
+ 'delta': 2.0,
40
+ 'normalizers': '',
41
+ 'ignore_repeated_bigrams': False,
42
+ 'detection_z_threshold': 4.0,
43
+ 'select_green_tokens': True,
44
+ 'skip_model_load': False,
45
+ 'seed_separately': True,
46
+ }
47
+
48
+ args.__dict__.update(arg_dict)
49
+
50
+ from demo_watermark import main
51
+
52
+ main(args)
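If the default 6.7B model is too large for your hardware, the commented-out entries in `arg_dict` above hint at smaller drop-in choices; a sketch of the only change needed (edit the dictionary before the `update` call; which sizes actually fit is hardware-dependent):

```python
# e.g. run the demo with a smaller model, optionally in fp16 when a GPU is available
arg_dict['model_name_or_path'] = 'facebook/opt-1.3b'
arg_dict['load_fp16'] = True  # roughly halves memory use on GPU
```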
lm-watermarking-main/demo_watermark.py ADDED
@@ -0,0 +1,702 @@
1
+ # coding=utf-8
2
+ # Copyright 2023 Authors of "A Watermark for Large Language Models"
3
+ # available at https://arxiv.org/abs/2301.10226
4
+ #
5
+ # Licensed under the Apache License, Version 2.0 (the "License");
6
+ # you may not use this file except in compliance with the License.
7
+ # You may obtain a copy of the License at
8
+ #
9
+ # http://www.apache.org/licenses/LICENSE-2.0
10
+ #
11
+ # Unless required by applicable law or agreed to in writing, software
12
+ # distributed under the License is distributed on an "AS IS" BASIS,
13
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14
+ # See the License for the specific language governing permissions and
15
+ # limitations under the License.
16
+
17
+ import os
18
+ import argparse
19
+ from argparse import Namespace
20
+ from pprint import pprint
21
+ from functools import partial
22
+
23
+ import numpy # for gradio hot reload
24
+ import gradio as gr
25
+
26
+ import torch
27
+
28
+ from transformers import (AutoTokenizer,
29
+ AutoModelForSeq2SeqLM,
30
+ AutoModelForCausalLM,
31
+ LogitsProcessorList)
32
+
33
+ from watermark_processor import WatermarkLogitsProcessor, WatermarkDetector
34
+
35
+ def str2bool(v):
36
+ """Util function for user friendly boolean flag args"""
37
+ if isinstance(v, bool):
38
+ return v
39
+ if v.lower() in ('yes', 'true', 't', 'y', '1'):
40
+ return True
41
+ elif v.lower() in ('no', 'false', 'f', 'n', '0'):
42
+ return False
43
+ else:
44
+ raise argparse.ArgumentTypeError('Boolean value expected.')
45
+
46
+ def parse_args():
47
+ """Command line argument specification"""
48
+
49
+ parser = argparse.ArgumentParser(description="A minimum working example of applying the watermark to any LLM that supports the huggingface 🤗 `generate` API")
50
+
51
+ parser.add_argument(
52
+ "--run_gradio",
53
+ type=str2bool,
54
+ default=True,
55
+ help="Whether to launch as a gradio demo. Set to False if gradio is not installed and you just want to run the stdout version.",
56
+ )
57
+ parser.add_argument(
58
+ "--demo_public",
59
+ type=str2bool,
60
+ default=False,
61
+ help="Whether to expose the gradio demo to the internet.",
62
+ )
63
+ parser.add_argument(
64
+ "--model_name_or_path",
65
+ type=str,
66
+ default="facebook/opt-6.7b",
67
+ help="Main model, path to pretrained model or model identifier from huggingface.co/models.",
68
+ )
69
+ parser.add_argument(
70
+ "--prompt_max_length",
71
+ type=int,
72
+ default=None,
73
+ help="Truncation length for prompt, overrides model config's max length field.",
74
+ )
75
+ parser.add_argument(
76
+ "--max_new_tokens",
77
+ type=int,
78
+ default=200,
79
+ help="Maximum number of new tokens to generate.",
80
+ )
81
+ parser.add_argument(
82
+ "--generation_seed",
83
+ type=int,
84
+ default=123,
85
+ help="Seed for setting the torch global rng prior to generation.",
86
+ )
87
+ parser.add_argument(
88
+ "--use_sampling",
89
+ type=str2bool,
90
+ default=True,
91
+ help="Whether to generate using multinomial sampling.",
92
+ )
93
+ parser.add_argument(
94
+ "--sampling_temp",
95
+ type=float,
96
+ default=0.7,
97
+ help="Sampling temperature to use when generating using multinomial sampling.",
98
+ )
99
+ parser.add_argument(
100
+ "--n_beams",
101
+ type=int,
102
+ default=1,
103
+ help="Number of beams to use for beam search. 1 is normal greedy decoding",
104
+ )
105
+ parser.add_argument(
106
+ "--use_gpu",
107
+ type=str2bool,
108
+ default=True,
109
+ help="Whether to run inference and watermark hashing/seeding/permutation on gpu.",
110
+ )
111
+ parser.add_argument(
112
+ "--seeding_scheme",
113
+ type=str,
114
+ default="simple_1",
115
+ help="Seeding scheme to use to generate the greenlists at each generation and verification step.",
116
+ )
117
+ parser.add_argument(
118
+ "--gamma",
119
+ type=float,
120
+ default=0.25,
121
+ help="The fraction of the vocabulary to partition into the greenlist at each generation and verification step.",
122
+ )
123
+ parser.add_argument(
124
+ "--delta",
125
+ type=float,
126
+ default=2.0,
127
+ help="The amount/bias to add to each of the greenlist token logits before each token sampling step.",
128
+ )
129
+ parser.add_argument(
130
+ "--normalizers",
131
+ type=str,
132
+ default="",
133
+ help="Single or comma separated list of the preprocessors/normalizer names to use when performing watermark detection.",
134
+ )
135
+ parser.add_argument(
136
+ "--ignore_repeated_bigrams",
137
+ type=str2bool,
138
+ default=False,
139
+ help="Whether to use the detection method that only counts each unique bigram once as either a green or red hit.",
140
+ )
141
+ parser.add_argument(
142
+ "--detection_z_threshold",
143
+ type=float,
144
+ default=4.0,
145
+ help="The test statistic threshold for the detection hypothesis test.",
146
+ )
147
+ parser.add_argument(
148
+ "--select_green_tokens",
149
+ type=str2bool,
150
+ default=True,
151
+ help="How to treat the permutation when selecting the greenlist tokens at each step. Legacy behavior (False) is to pick the complement/reds first.",
152
+ )
153
+ parser.add_argument(
154
+ "--skip_model_load",
155
+ type=str2bool,
156
+ default=False,
157
+ help="Skip the model loading to debug the interface.",
158
+ )
159
+ parser.add_argument(
160
+ "--seed_separately",
161
+ type=str2bool,
162
+ default=True,
163
+ help="Whether to call the torch seed function before both the unwatermarked and watermarked generate calls.",
164
+ )
165
+ parser.add_argument(
166
+ "--load_fp16",
167
+ type=str2bool,
168
+ default=False,
169
+ help="Whether to run the model in float16 precision.",
170
+ )
171
+ args = parser.parse_args()
172
+ return args
173
+
174
+ def load_model(args):
175
+ """Load and return the model and tokenizer"""
176
+
177
+ args.is_seq2seq_model = any([(model_type in args.model_name_or_path) for model_type in ["t5","T0"]])
178
+ args.is_decoder_only_model = any([(model_type in args.model_name_or_path) for model_type in ["gpt","opt","bloom"]])
179
+ if args.is_seq2seq_model:
180
+ model = AutoModelForSeq2SeqLM.from_pretrained(args.model_name_or_path)
181
+ elif args.is_decoder_only_model:
182
+ if args.load_fp16:
183
+ model = AutoModelForCausalLM.from_pretrained(args.model_name_or_path,torch_dtype=torch.float16, device_map='auto')
184
+ else:
185
+ model = AutoModelForCausalLM.from_pretrained(args.model_name_or_path)
186
+ else:
187
+ raise ValueError(f"Unknown model type: {args.model_name_or_path}")
188
+
189
+ if args.use_gpu:
190
+ device = "cuda" if torch.cuda.is_available() else "cpu"
191
+ if args.load_fp16:
192
+ pass
193
+ else:
194
+ model = model.to(device)
195
+ else:
196
+ device = "cpu"
197
+ model.eval()
198
+
199
+ tokenizer = AutoTokenizer.from_pretrained(args.model_name_or_path)
200
+
201
+ return model, tokenizer, device
202
+
203
+ def generate(prompt, args, model=None, device=None, tokenizer=None):
204
+ """Instantiate the WatermarkLogitsProcessor according to the watermark parameters
205
+ and generate watermarked text by passing it to the generate method of the model
206
+ as a logits processor. """
207
+
208
+ print(f"Generating with {args}")
209
+
210
+ watermark_processor = WatermarkLogitsProcessor(vocab=list(tokenizer.get_vocab().values()),
211
+ gamma=args.gamma,
212
+ delta=args.delta,
213
+ seeding_scheme=args.seeding_scheme,
214
+ select_green_tokens=args.select_green_tokens)
215
+
216
+ gen_kwargs = dict(max_new_tokens=args.max_new_tokens)
217
+
218
+ if args.use_sampling:
219
+ gen_kwargs.update(dict(
220
+ do_sample=True,
221
+ top_k=0,
222
+ temperature=args.sampling_temp
223
+ ))
224
+ else:
225
+ gen_kwargs.update(dict(
226
+ num_beams=args.n_beams
227
+ ))
228
+
229
+ generate_without_watermark = partial(
230
+ model.generate,
231
+ **gen_kwargs
232
+ )
233
+ generate_with_watermark = partial(
234
+ model.generate,
235
+ logits_processor=LogitsProcessorList([watermark_processor]),
236
+ **gen_kwargs
237
+ )
238
+ if args.prompt_max_length:
239
+ pass
240
+ elif hasattr(model.config,"max_position_embeddings"):
241
+ args.prompt_max_length = model.config.max_position_embeddings-args.max_new_tokens
242
+ else:
243
+ args.prompt_max_length = 2048-args.max_new_tokens
244
+
245
+ tokd_input = tokenizer(prompt, return_tensors="pt", add_special_tokens=True, truncation=True, max_length=args.prompt_max_length).to(device)
246
+ truncation_warning = True if tokd_input["input_ids"].shape[-1] == args.prompt_max_length else False
247
+ redecoded_input = tokenizer.batch_decode(tokd_input["input_ids"], skip_special_tokens=True)[0]
248
+
249
+ torch.manual_seed(args.generation_seed)
250
+ output_without_watermark = generate_without_watermark(**tokd_input)
251
+
252
+ # Optionally re-seed before the second generation; outputs will generally still differ unless delta==0.0 (a no-op watermark).
253
+ if args.seed_separately:
254
+ torch.manual_seed(args.generation_seed)
255
+ output_with_watermark = generate_with_watermark(**tokd_input)
256
+
257
+ if args.is_decoder_only_model:
258
+ # need to isolate the newly generated tokens
259
+ output_without_watermark = output_without_watermark[:,tokd_input["input_ids"].shape[-1]:]
260
+ output_with_watermark = output_with_watermark[:,tokd_input["input_ids"].shape[-1]:]
261
+
262
+ decoded_output_without_watermark = tokenizer.batch_decode(output_without_watermark, skip_special_tokens=True)[0]
263
+ decoded_output_with_watermark = tokenizer.batch_decode(output_with_watermark, skip_special_tokens=True)[0]
264
+
265
+ return (redecoded_input,
266
+ int(truncation_warning),
267
+ decoded_output_without_watermark,
268
+ decoded_output_with_watermark,
269
+ args)
270
+ # decoded_output_with_watermark)
271
+
272
+ def format_names(s):
273
+ """Format names for the gradio demo interface"""
274
+ s=s.replace("num_tokens_scored","Tokens Counted (T)")
275
+ s=s.replace("num_green_tokens","# Tokens in Greenlist")
276
+ s=s.replace("green_fraction","Fraction of T in Greenlist")
277
+ s=s.replace("z_score","z-score")
278
+ s=s.replace("p_value","p value")
279
+ s=s.replace("prediction","Prediction")
280
+ s=s.replace("confidence","Confidence")
281
+ return s
282
+
283
+ def list_format_scores(score_dict, detection_threshold):
284
+ """Format the detection metrics into a gradio dataframe input format"""
285
+ lst_2d = []
286
+ # lst_2d.append(["z-score threshold", f"{detection_threshold}"])
287
+ for k,v in score_dict.items():
288
+ if k=='green_fraction':
289
+ lst_2d.append([format_names(k), f"{v:.1%}"])
290
+ elif k=='confidence':
291
+ lst_2d.append([format_names(k), f"{v:.3%}"])
292
+ elif isinstance(v, float):
293
+ lst_2d.append([format_names(k), f"{v:.3g}"])
294
+ elif isinstance(v, bool):
295
+ lst_2d.append([format_names(k), ("Watermarked" if v else "Human/Unwatermarked")])
296
+ else:
297
+ lst_2d.append([format_names(k), f"{v}"])
298
+ if "confidence" in score_dict:
299
+ lst_2d.insert(-2,["z-score Threshold", f"{detection_threshold}"])
300
+ else:
301
+ lst_2d.insert(-1,["z-score Threshold", f"{detection_threshold}"])
302
+ return lst_2d
303
+
304
+ def detect(input_text, args, device=None, tokenizer=None):
305
+ """Instantiate the WatermarkDetection object and call detect on
306
+ the input text returning the scores and outcome of the test"""
307
+ watermark_detector = WatermarkDetector(vocab=list(tokenizer.get_vocab().values()),
308
+ gamma=args.gamma,
309
+ seeding_scheme=args.seeding_scheme,
310
+ device=device,
311
+ tokenizer=tokenizer,
312
+ z_threshold=args.detection_z_threshold,
313
+ normalizers=args.normalizers,
314
+ ignore_repeated_bigrams=args.ignore_repeated_bigrams,
315
+ select_green_tokens=args.select_green_tokens)
316
+ if len(input_text)-1 > watermark_detector.min_prefix_len:
317
+ score_dict = watermark_detector.detect(input_text)
318
+ # output = str_format_scores(score_dict, watermark_detector.z_threshold)
319
+ output = list_format_scores(score_dict, watermark_detector.z_threshold)
320
+ else:
321
+ # output = (f"Error: string not long enough to compute watermark presence.")
322
+ output = [["Error","string too short to compute metrics"]]
323
+ output += [["",""] for _ in range(6)]
324
+ return output, args
325
+
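The helpers above can also be driven without the gradio UI; a minimal sketch (argument and return signatures follow the function definitions above; downloading and running the model still requires the usual huggingface setup):

```python
args = parse_args()
model, tokenizer, device = load_model(args)
_, _, no_wm_text, wm_text, args = generate("The watermark test prompt", args,
                                           model=model, device=device, tokenizer=tokenizer)
rows, _ = detect(wm_text, args, device=device, tokenizer=tokenizer)  # list of [metric, value] rows
```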
326
+ def run_gradio(args, model=None, device=None, tokenizer=None):
327
+ """Define and launch the gradio demo interface"""
328
+ generate_partial = partial(generate, model=model, device=device, tokenizer=tokenizer)
329
+ detect_partial = partial(detect, device=device, tokenizer=tokenizer)
330
+
331
+ with gr.Blocks() as demo:
332
+ # Top section, greeting and instructions
333
+ with gr.Row():
334
+ with gr.Column(scale=9):
335
+ gr.Markdown(
336
+ """
337
+ ## 💧 [A Watermark for Large Language Models](https://arxiv.org/abs/2301.10226) 🔍
338
+ """
339
+ )
340
+ with gr.Column(scale=1):
341
+ gr.Markdown(
342
+ """
343
+ [![](https://badgen.net/badge/icon/GitHub?icon=github&label)](https://github.com/jwkirchenbauer/lm-watermarking)
344
+ """
345
+ )
346
+ # with gr.Column(scale=2):
347
+ # pass
348
+ # ![visitor badge](https://visitor-badge.glitch.me/badge?page_id=tomg-group-umd_lm-watermarking) # buggy
349
+
350
+ with gr.Accordion("Understanding the output metrics",open=False):
351
+ gr.Markdown(
352
+ """
353
+ - `z-score threshold` : The cutoff for the hypothesis test
354
+ - `Tokens Counted (T)` : The number of tokens in the output that were counted by the detection algorithm.
355
+ The first token is omitted in the simple, single-token seeding scheme since there is no way to generate
356
+ a greenlist for it as it has no prefix token(s). Under the "Ignore Bigram Repeats" detection algorithm,
357
+ described in the bottom panel, this can be much less than the total number of tokens generated if there is a lot of repetition.
358
+ - `# Tokens in Greenlist` : The number of tokens that were observed to fall in their respective greenlist
359
+ - `Fraction of T in Greenlist` : The `# Tokens in Greenlist` / `T`. This is expected to be approximately `gamma` for human/unwatermarked text.
360
+ - `z-score` : The test statistic for the detection hypothesis test. If larger than the `z-score threshold`
361
+ we "reject the null hypothesis" that the text is human/unwatermarked, and conclude it is watermarked
362
+ - `p value` : The likelihood of observing the computed `z-score` under the null hypothesis. This is the likelihood of
363
+ observing the `Fraction of T in Greenlist` given that the text was generated without knowledge of the watermark procedure/greenlists.
364
+ If this is extremely _small_ we are confident that this many green tokens was not chosen by random chance.
365
+ - `prediction` : The outcome of the hypothesis test - whether the observed `z-score` was higher than the `z-score threshold`
366
+ - `confidence` : If we reject the null hypothesis, and the `prediction` is "Watermarked", then we report 1-`p value` to represent
367
+ the confidence of the detection based on the unlikeliness of this `z-score` observation.
368
+ """
369
+ )
370
+
371
+ with gr.Accordion("A note on model capability",open=True):
372
+ gr.Markdown(
373
+ """
374
+ This demo uses open-source language models that fit on a single GPU. These models are less powerful than proprietary commercial tools like ChatGPT, Claude, or Bard.
375
+
376
+ Importantly, we use a language model that is designed to "complete" your prompt, and not a model that is fine-tuned to follow instructions.
377
+ For best results, prompt the model with a few sentences that form the beginning of a paragraph, and then allow it to "continue" your paragraph.
378
+ Some examples include the opening paragraph of a wikipedia article, or the first few sentences of a story.
379
+ Longer prompts that end mid-sentence will result in more fluent generations.
380
+ """
381
+ )
382
+ gr.Markdown(f"Language model: {args.model_name_or_path} {'(float16 mode)' if args.load_fp16 else ''}")
383
+
384
+ # Construct state for parameters, define updates and toggles
385
+ default_prompt = args.__dict__.pop("default_prompt")
386
+ session_args = gr.State(value=args)
387
+
388
+ with gr.Tab("Generate and Detect"):
389
+
390
+ with gr.Row():
391
+ prompt = gr.Textbox(label=f"Prompt", interactive=True,lines=10,max_lines=10, value=default_prompt)
392
+ with gr.Row():
393
+ generate_btn = gr.Button("Generate")
394
+ with gr.Row():
395
+ with gr.Column(scale=2):
396
+ output_without_watermark = gr.Textbox(label="Output Without Watermark", interactive=False,lines=14,max_lines=14)
397
+ with gr.Column(scale=1):
398
+ # without_watermark_detection_result = gr.Textbox(label="Detection Result", interactive=False,lines=14,max_lines=14)
399
+ without_watermark_detection_result = gr.Dataframe(headers=["Metric", "Value"], interactive=False,row_count=7,col_count=2)
400
+ with gr.Row():
401
+ with gr.Column(scale=2):
402
+ output_with_watermark = gr.Textbox(label="Output With Watermark", interactive=False,lines=14,max_lines=14)
403
+ with gr.Column(scale=1):
404
+ # with_watermark_detection_result = gr.Textbox(label="Detection Result", interactive=False,lines=14,max_lines=14)
405
+ with_watermark_detection_result = gr.Dataframe(headers=["Metric", "Value"],interactive=False,row_count=7,col_count=2)
406
+
407
+ redecoded_input = gr.Textbox(visible=False)
408
+ truncation_warning = gr.Number(visible=False)
409
+ def truncate_prompt(redecoded_input, truncation_warning, orig_prompt, args):
410
+ if truncation_warning:
411
+ return redecoded_input + f"\n\n[Prompt was truncated before generation due to length...]", args
412
+ else:
413
+ return orig_prompt, args
414
+
415
+ with gr.Tab("Detector Only"):
416
+ with gr.Row():
417
+ with gr.Column(scale=2):
418
+ detection_input = gr.Textbox(label="Text to Analyze", interactive=True,lines=14,max_lines=14)
419
+ with gr.Column(scale=1):
420
+ # detection_result = gr.Textbox(label="Detection Result", interactive=False,lines=14,max_lines=14)
421
+ detection_result = gr.Dataframe(headers=["Metric", "Value"], interactive=False,row_count=7,col_count=2)
422
+ with gr.Row():
423
+ detect_btn = gr.Button("Detect")
424
+
425
+ # Parameter selection group
426
+ with gr.Accordion("Advanced Settings",open=False):
427
+ with gr.Row():
428
+ with gr.Column(scale=1):
429
+ gr.Markdown(f"#### Generation Parameters")
430
+ with gr.Row():
431
+ decoding = gr.Radio(label="Decoding Method",choices=["multinomial", "greedy"], value=("multinomial" if args.use_sampling else "greedy"))
432
+ with gr.Row():
433
+ sampling_temp = gr.Slider(label="Sampling Temperature", minimum=0.1, maximum=1.0, step=0.1, value=args.sampling_temp, visible=True)
434
+ with gr.Row():
435
+ generation_seed = gr.Number(label="Generation Seed",value=args.generation_seed, interactive=True)
436
+ with gr.Row():
437
+ n_beams = gr.Dropdown(label="Number of Beams",choices=list(range(1,11,1)), value=args.n_beams, visible=(not args.use_sampling))
438
+ with gr.Row():
439
+ max_new_tokens = gr.Slider(label="Max Generated Tokens", minimum=10, maximum=1000, step=10, value=args.max_new_tokens)
440
+
441
+ with gr.Column(scale=1):
442
+ gr.Markdown(f"#### Watermark Parameters")
443
+ with gr.Row():
444
+ gamma = gr.Slider(label="gamma",minimum=0.1, maximum=0.9, step=0.05, value=args.gamma)
445
+ with gr.Row():
446
+ delta = gr.Slider(label="delta",minimum=0.0, maximum=10.0, step=0.1, value=args.delta)
447
+ gr.Markdown(f"#### Detector Parameters")
448
+ with gr.Row():
449
+ detection_z_threshold = gr.Slider(label="z-score threshold",minimum=0.0, maximum=10.0, step=0.1, value=args.detection_z_threshold)
450
+ with gr.Row():
451
+ ignore_repeated_bigrams = gr.Checkbox(label="Ignore Bigram Repeats")
452
+ with gr.Row():
453
+ normalizers = gr.CheckboxGroup(label="Normalizations", choices=["unicode", "homoglyphs", "truecase"], value=args.normalizers)
454
+ # with gr.Accordion("Actual submitted parameters:",open=False):
455
+ with gr.Row():
456
+ gr.Markdown(f"_Note: sliders don't always update perfectly. Clicking on the bar or using the number window to the right can help. Window below shows the current settings._")
457
+ with gr.Row():
458
+ current_parameters = gr.Textbox(label="Current Parameters", value=args)
459
+ with gr.Accordion("Legacy Settings",open=False):
460
+ with gr.Row():
461
+ with gr.Column(scale=1):
462
+ seed_separately = gr.Checkbox(label="Seed both generations separately", value=args.seed_separately)
463
+ with gr.Column(scale=1):
464
+ select_green_tokens = gr.Checkbox(label="Select 'greenlist' from partition", value=args.select_green_tokens)
465
+
466
+ with gr.Accordion("Understanding the settings",open=False):
467
+ gr.Markdown(
468
+ """
469
+ #### Generation Parameters:
470
+
471
+ - Decoding Method : We can generate tokens from the model using either multinomial sampling or we can generate using greedy decoding.
472
+ - Sampling Temperature : If using multinomial sampling we can set the temperature of the sampling distribution.
473
+ 0.0 is equivalent to greedy decoding, and 1.0 is the maximum amount of variability/entropy in the next token distribution.
474
+ 0.7 strikes a nice balance, staying faithful to the model's estimate of the top candidates while adding variety. Does not apply for greedy decoding.
475
+ - Generation Seed : The integer to pass to the torch random number generator before running generation. Makes the multinomial sampling strategy
476
+ outputs reproducible. Does not apply for greedy decoding.
477
+ - Number of Beams : When using greedy decoding, we can also set the number of beams to > 1 to enable beam search.
478
+ This is not implemented for multinomial sampling (and was excluded from the paper), but may be added in the future.
479
+ - Max Generated Tokens : The `max_new_tokens` parameter passed to the generation method to stop the output at a certain number of new tokens.
480
+ Note that the model is free to generate fewer tokens depending on the prompt.
481
+ Implicitly this sets the maximum number of prompt tokens possible as the model's maximum input length minus `max_new_tokens`,
482
+ and inputs will be truncated accordingly.
483
+
484
+ #### Watermark Parameters:
485
+
486
+ - gamma : The fraction of the vocabulary to be partitioned into the greenlist at each generation step.
487
+ Smaller gamma values create a stronger watermark by enabling the watermarked model to achieve
488
+ a greater differentiation from human/unwatermarked text because it is preferentially sampling
489
+ from a smaller green set making those tokens less likely to occur by chance.
490
+ - delta : The amount of positive bias to add to the logits of every token in the greenlist
491
+ at each generation step before sampling/choosing the next token. Higher delta values
492
+ mean that the greenlist tokens are more heavily preferred by the watermarked model
493
+ and as the bias becomes very large the watermark transitions from "soft" to "hard".
494
+ For a hard watermark, nearly all tokens are green, but this can have a detrimental effect on
495
+ generation quality, especially when there is not a lot of flexibility in the distribution.
496
+
497
+ #### Detector Parameters:
498
+
499
+ - z-score threshold : the z-score cutoff for the hypothesis test. Higher thresholds (such as 4.0) make
500
+ _false positives_ (predicting that human/unwatermarked text is watermarked) very unlikely
501
+ as a genuine human text with a significant number of tokens will almost never achieve
502
+ that high of a z-score. Lower thresholds will capture more _true positives_ as some watermarked
503
+ texts will contain fewer green tokens and achieve a lower z-score, but still pass the lower bar and
504
+ be flagged as "watermarked". However, a lower threshold will increase the chance that human text
505
+ that contains a slightly higher than average number of green tokens is erroneously flagged.
506
+ 4.0-5.0 offers extremely low false positive rates while still accurately catching most watermarked text.
507
+ - Ignore Bigram Repeats : This alternate detection algorithm only considers the unique bigrams in the text during detection,
508
+ computing the greenlists based on the first in each pair and checking whether the second falls within the list.
509
+ This means that `T` is now the unique number of bigrams in the text, which becomes less than the total
510
+ number of tokens generated if the text contains a lot of repetition. See the paper for a more detailed discussion.
511
+ - Normalizations : we implement a few basic normalizations to defend against various adversarial perturbations of the
512
+ text analyzed during detection. Currently we support converting all characters to unicode,
513
+ replacing homoglyphs with a canonical form, and standardizing the capitalization.
514
+ See the paper for a detailed discussion of input normalization.
515
+ """
516
+ )
517
+
518
+ gr.HTML("""
519
+ <p>For faster inference without waiting in queue, you may duplicate the space and upgrade to GPU in settings.
520
+ Follow the github link at the top and host the demo on your own GPU hardware to test out larger models.
521
+ <br/>
522
+ <a href="https://huggingface.co/spaces/tomg-group-umd/lm-watermarking?duplicate=true">
523
+ <img style="margin-top: 0em; margin-bottom: 0em" src="https://bit.ly/3gLdBN6" alt="Duplicate Space"></a>
524
+ </p>
525
+ """)
526
+
527
+ # Register main generation tab click, outputting the generations as well as the encoded+redecoded (and potentially truncated) prompt and a truncation flag
528
+ generate_btn.click(fn=generate_partial, inputs=[prompt,session_args], outputs=[redecoded_input, truncation_warning, output_without_watermark, output_with_watermark,session_args])
529
+ # Show truncated version of prompt if truncation occurred
530
+ redecoded_input.change(fn=truncate_prompt, inputs=[redecoded_input,truncation_warning,prompt,session_args], outputs=[prompt,session_args])
531
+ # Call detection when the outputs (of the generate function) are updated
532
+ output_without_watermark.change(fn=detect_partial, inputs=[output_without_watermark,session_args], outputs=[without_watermark_detection_result,session_args])
533
+ output_with_watermark.change(fn=detect_partial, inputs=[output_with_watermark,session_args], outputs=[with_watermark_detection_result,session_args])
534
+ # Register main detection tab click
535
+ detect_btn.click(fn=detect_partial, inputs=[detection_input,session_args], outputs=[detection_result, session_args])
536
+
537
+ # State management logic
538
+ # update callbacks that change the state dict
539
+ def update_sampling_temp(session_state, value): session_state.sampling_temp = float(value); return session_state
540
+ def update_generation_seed(session_state, value): session_state.generation_seed = int(value); return session_state
541
+ def update_gamma(session_state, value): session_state.gamma = float(value); return session_state
542
+ def update_delta(session_state, value): session_state.delta = float(value); return session_state
543
+ def update_detection_z_threshold(session_state, value): session_state.detection_z_threshold = float(value); return session_state
544
+ def update_decoding(session_state, value):
545
+ if value == "multinomial":
546
+ session_state.use_sampling = True
547
+ elif value == "greedy":
548
+ session_state.use_sampling = False
549
+ return session_state
550
+ def toggle_sampling_vis(value):
551
+ if value == "multinomial":
552
+ return gr.update(visible=True)
553
+ elif value == "greedy":
554
+ return gr.update(visible=False)
555
+ def toggle_sampling_vis_inv(value):
556
+ if value == "multinomial":
557
+ return gr.update(visible=False)
558
+ elif value == "greedy":
559
+ return gr.update(visible=True)
560
+ def update_n_beams(session_state, value): session_state.n_beams = value; return session_state
561
+ def update_max_new_tokens(session_state, value): session_state.max_new_tokens = int(value); return session_state
562
+ def update_ignore_repeated_bigrams(session_state, value): session_state.ignore_repeated_bigrams = value; return session_state
563
+ def update_normalizers(session_state, value): session_state.normalizers = value; return session_state
564
+ def update_seed_separately(session_state, value): session_state.seed_separately = value; return session_state
565
+ def update_select_green_tokens(session_state, value): session_state.select_green_tokens = value; return session_state
566
+ # registering callbacks for toggling the visibility of certain parameters
567
+ decoding.change(toggle_sampling_vis,inputs=[decoding], outputs=[sampling_temp])
568
+ decoding.change(toggle_sampling_vis,inputs=[decoding], outputs=[generation_seed])
569
+ decoding.change(toggle_sampling_vis_inv,inputs=[decoding], outputs=[n_beams])
570
+ # registering all state update callbacks
571
+ decoding.change(update_decoding,inputs=[session_args, decoding], outputs=[session_args])
572
+ sampling_temp.change(update_sampling_temp,inputs=[session_args, sampling_temp], outputs=[session_args])
573
+ generation_seed.change(update_generation_seed,inputs=[session_args, generation_seed], outputs=[session_args])
574
+ n_beams.change(update_n_beams,inputs=[session_args, n_beams], outputs=[session_args])
575
+ max_new_tokens.change(update_max_new_tokens,inputs=[session_args, max_new_tokens], outputs=[session_args])
576
+ gamma.change(update_gamma,inputs=[session_args, gamma], outputs=[session_args])
577
+ delta.change(update_delta,inputs=[session_args, delta], outputs=[session_args])
578
+ detection_z_threshold.change(update_detection_z_threshold,inputs=[session_args, detection_z_threshold], outputs=[session_args])
579
+ ignore_repeated_bigrams.change(update_ignore_repeated_bigrams,inputs=[session_args, ignore_repeated_bigrams], outputs=[session_args])
580
+ normalizers.change(update_normalizers,inputs=[session_args, normalizers], outputs=[session_args])
581
+ seed_separately.change(update_seed_separately,inputs=[session_args, seed_separately], outputs=[session_args])
582
+ select_green_tokens.change(update_select_green_tokens,inputs=[session_args, select_green_tokens], outputs=[session_args])
583
+ # register additional callback on button clicks that updates the shown parameters window
584
+ generate_btn.click(lambda value: str(value), inputs=[session_args], outputs=[current_parameters])
585
+ detect_btn.click(lambda value: str(value), inputs=[session_args], outputs=[current_parameters])
586
+ # When the parameters change, display the update and fire detection, since some detection params don't change the model output.
587
+ gamma.change(lambda value: str(value), inputs=[session_args], outputs=[current_parameters])
588
+ gamma.change(fn=detect_partial, inputs=[output_without_watermark,session_args], outputs=[without_watermark_detection_result,session_args])
589
+ gamma.change(fn=detect_partial, inputs=[output_with_watermark,session_args], outputs=[with_watermark_detection_result,session_args])
590
+ gamma.change(fn=detect_partial, inputs=[detection_input,session_args], outputs=[detection_result,session_args])
591
+ detection_z_threshold.change(lambda value: str(value), inputs=[session_args], outputs=[current_parameters])
592
+ detection_z_threshold.change(fn=detect_partial, inputs=[output_without_watermark,session_args], outputs=[without_watermark_detection_result,session_args])
593
+ detection_z_threshold.change(fn=detect_partial, inputs=[output_with_watermark,session_args], outputs=[with_watermark_detection_result,session_args])
594
+ detection_z_threshold.change(fn=detect_partial, inputs=[detection_input,session_args], outputs=[detection_result,session_args])
595
+ ignore_repeated_bigrams.change(lambda value: str(value), inputs=[session_args], outputs=[current_parameters])
596
+ ignore_repeated_bigrams.change(fn=detect_partial, inputs=[output_without_watermark,session_args], outputs=[without_watermark_detection_result,session_args])
597
+ ignore_repeated_bigrams.change(fn=detect_partial, inputs=[output_with_watermark,session_args], outputs=[with_watermark_detection_result,session_args])
598
+ ignore_repeated_bigrams.change(fn=detect_partial, inputs=[detection_input,session_args], outputs=[detection_result,session_args])
599
+ normalizers.change(lambda value: str(value), inputs=[session_args], outputs=[current_parameters])
600
+ normalizers.change(fn=detect_partial, inputs=[output_without_watermark,session_args], outputs=[without_watermark_detection_result,session_args])
601
+ normalizers.change(fn=detect_partial, inputs=[output_with_watermark,session_args], outputs=[with_watermark_detection_result,session_args])
602
+ normalizers.change(fn=detect_partial, inputs=[detection_input,session_args], outputs=[detection_result,session_args])
603
+ select_green_tokens.change(lambda value: str(value), inputs=[session_args], outputs=[current_parameters])
604
+ select_green_tokens.change(fn=detect_partial, inputs=[output_without_watermark,session_args], outputs=[without_watermark_detection_result,session_args])
605
+ select_green_tokens.change(fn=detect_partial, inputs=[output_with_watermark,session_args], outputs=[with_watermark_detection_result,session_args])
606
+ select_green_tokens.change(fn=detect_partial, inputs=[detection_input,session_args], outputs=[detection_result,session_args])
607
+
608
+
609
+ demo.queue(concurrency_count=3)
610
+
611
+ if args.demo_public:
612
+ demo.launch(share=True) # exposes app to the internet via randomly generated link
613
+ else:
614
+ demo.launch()
615
+
616
+ def main(args):
617
+ """Run a command line version of the generation and detection operations
618
+ and optionally launch and serve the gradio demo"""
619
+ # Initial arg processing and log
620
+ args.normalizers = (args.normalizers.split(",") if args.normalizers else [])
621
+ print(args)
622
+
623
+ if not args.skip_model_load:
624
+ model, tokenizer, device = load_model(args)
625
+ else:
626
+ model, tokenizer, device = None, None, None
627
+
628
+ # Generate and detect, report to stdout
629
+ if not args.skip_model_load:
630
+ input_text = (
631
+ "The diamondback terrapin or simply terrapin (Malaclemys terrapin) is a "
632
+ "species of turtle native to the brackish coastal tidal marshes of the "
633
+ "Northeastern and southern United States, and in Bermuda.[6] It belongs "
634
+ "to the monotypic genus Malaclemys. It has one of the largest ranges of "
635
+ "all turtles in North America, stretching as far south as the Florida Keys "
636
+ "and as far north as Cape Cod.[7] The name 'terrapin' is derived from the "
637
+ "Algonquian word torope.[8] It applies to Malaclemys terrapin in both "
638
+ "British English and American English. The name originally was used by "
639
+ "early European settlers in North America to describe these brackish-water "
640
+ "turtles that inhabited neither freshwater habitats nor the sea. It retains "
641
+ "this primary meaning in American English.[8] In British English, however, "
642
+ "other semi-aquatic turtle species, such as the red-eared slider, might "
643
+ "also be called terrapins. The common name refers to the diamond pattern "
644
+ "on top of its shell (carapace), but the overall pattern and coloration "
645
+ "vary greatly. The shell is usually wider at the back than in the front, "
646
+ "and from above it appears wedge-shaped. The shell coloring can vary "
647
+ "from brown to grey, and its body color can be grey, brown, yellow, "
648
+ "or white. All have a unique pattern of wiggly, black markings or spots "
649
+ "on their body and head. The diamondback terrapin has large webbed "
650
+ "feet.[9] The species is"
651
+ )
652
+
653
+ args.default_prompt = input_text
654
+
655
+ term_width = 80
656
+ print("#"*term_width)
657
+ print("Prompt:")
658
+ print(input_text)
659
+
660
+ _, _, decoded_output_without_watermark, decoded_output_with_watermark, _ = generate(input_text,
661
+ args,
662
+ model=model,
663
+ device=device,
664
+ tokenizer=tokenizer)
665
+ without_watermark_detection_result = detect(decoded_output_without_watermark,
666
+ args,
667
+ device=device,
668
+ tokenizer=tokenizer)
669
+ with_watermark_detection_result = detect(decoded_output_with_watermark,
670
+ args,
671
+ device=device,
672
+ tokenizer=tokenizer)
673
+
674
+ print("#"*term_width)
675
+ print("Output without watermark:")
676
+ print(decoded_output_without_watermark)
677
+ print("-"*term_width)
678
+ print(f"Detection result @ {args.detection_z_threshold}:")
679
+ pprint(without_watermark_detection_result)
680
+ print("-"*term_width)
681
+
682
+ print("#"*term_width)
683
+ print("Output with watermark:")
684
+ print(decoded_output_with_watermark)
685
+ print("-"*term_width)
686
+ print(f"Detection result @ {args.detection_z_threshold}:")
687
+ pprint(with_watermark_detection_result)
688
+ print("-"*term_width)
689
+
690
+
691
+ # Launch the app to generate and detect interactively (implements the hf space demo)
692
+ if args.run_gradio:
693
+ run_gradio(args, model=model, tokenizer=tokenizer, device=device)
694
+
695
+ return
696
+
697
+ if __name__ == "__main__":
698
+
699
+ args = parse_args()
700
+ print(args)
701
+
702
+ main(args)
lm-watermarking-main/experiments/README.md ADDED
@@ -0,0 +1,91 @@
1
+ ## [Stale/Deprecated] Experimental Pipeline Code
2
+
3
+ This subdirectory contains reproducibility artifacts for the experiments described in the paper. All code here is deprecated in favor of the implementation and demo in the root of the repository.
4
+
5
+ In effect, the file `/watermark_processor.py` in the root of the repo is a clean, user-friendly reimplementation of the watermarking and detection logic from `watermark.py`. We suggest using the official release version over any code found in the `experiments` directory.
6
+
7
+ ## Overview
8
+
9
+ Unless stated otherwise, all files discussed here are in the `experiments` directory. The `bl` naming convention across many variables and function definitions refers to "blacklist". Black/white was the original language used in the development of the paper and was updated to green/red based on feedback from the community.
10
+
11
+ The implementation of the main experiments in the paper has two high-level steps:
12
+ - **(1) generate watermarked samples**
13
+ - **(2) compute metrics**
14
+
15
+ The code provided here implements these steps in the following files: `run_watermarking.py` and `process_rows.py`, where the core logic is implemented in `watermark.py`, a single-file library.
16
+
17
+ Generally speaking, the code implementing the watermark itself is a series of classes and functions based on the `LogitsProcessor` abstraction from [huggingface/transformers](https://github.com/huggingface/transformers) and the code that turns it into a workflow is based on the `dataset.map` functionality from [huggingface/datasets](https://github.com/huggingface/datasets).
18
+
19
+ The files `io_utils.py`, `submitit_utils.py` and `launch.py` contain utilities for file operations (mostly `jsonl`) and for hyperparameter sweeping via jobs launched on our compute cluster (managed using [SLURM](https://slurm.schedmd.com/documentation.html)). The [`submitit`](https://github.com/facebookincubator/submitit) workflow tool is an extra dependency only required if using `launch.py`.
20
+
21
+ ## Generation (`run_watermarking.py`)
22
+
23
+ `run_watermarking.py` is a command line script that:
24
+
25
+ 1. loads a huggingface `dataset` that will be used to create text prompts for the language model
26
+ 2. loads a huggingface language model that can perform text generation via `model.generate`, and prepares to call the generation method with a special `LogitsProcessor` that implements watermarking at the current hyperparameter values
27
+ 3. composes a series of functions that are applied to the dataset via `map` that preprocess and tokenize the prompt data, and generate completions to it via the model
28
+ 4. loads a second huggingface language model to be used as perplexity "oracle" for evaluating the quality of the texts generated by the watermarked model
29
+ 5. computes the teacher-forced loss (and perplexity) of the oracle model on the generated outputs
30
+
31
+ Here is an example of the argument set required to implement a single (representative) hyperparameter combination from the paper:
32
+
33
+ ```
34
+ python run_watermarking.py \
35
+ --model_name facebook/opt-1.3b \
36
+ --dataset_name c4 \
37
+ --dataset_config_name realnewslike \
38
+ --max_new_tokens 200 \
39
+ --min_prompt_tokens 50 \
40
+ --limit_indices 500 \
41
+ --input_truncation_strategy completion_length \
42
+ --input_filtering_strategy prompt_and_completion_length \
43
+ --output_filtering_strategy max_new_tokens \
44
+ --dynamic_seed markov_1 \
45
+ --bl_proportion 0.5 \
46
+ --bl_logit_bias 2.0 \
47
+ --bl_type soft \
48
+ --store_spike_ents True \
49
+ --num_beams 1 \
50
+ --use_sampling True \
51
+ --sampling_temp 0.7 \
52
+ --oracle_model_name facebook/opt-2.7b \
53
+ --run_name example_run \
54
+ --output_dir ./all_runs
55
+ ```
56
+
57
+ The result of each run is a directory with three files in it:
58
+ - `gen_table_meta.json` (hyperparameters passed from cmdline)
59
+ - `gen_table.jsonl`
60
+ - `gen_table_w_metrics.jsonl`
61
+
62
+ `gen_table_w_metrics` = "generation table with metrics", meaning it has the same rows as the first `jsonl` file, but contains more columns/features, such as perplexity.
63
+
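+ For quick inspection, a single run's outputs can be reloaded with the helpers in `io_utils.py`. Below is a minimal sketch (the run-directory path is illustrative and depends on how you set `--output_dir`/`--run_name`):
+ 
+ ```
+ import pandas as pd
+ from io_utils import read_json, load_jsonlines
+ 
+ run_dir = "all_runs/example_run"  # illustrative path to one run directory
+ meta = read_json(f"{run_dir}/gen_table_meta.json")             # cmdline hyperparameters
+ rows = load_jsonlines(f"{run_dir}/gen_table_w_metrics.jsonl")  # one record per prompt/output pair
+ df = pd.DataFrame(rows)
+ print(len(df), sorted(df.columns)[:10])
+ ```
+ 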
64
+ If you run multiple hyperparameter combinations, we suggest storing each of the run directories with those output files within one enclosing directory such as `all_runs` to facilitate the next step.
65
+
66
+ ## Computing Metrics (`process_rows.py`)
67
+ ... and merging hyperparameter runs by concatenation.
68
+
69
+ After running a few combinations of hyperparameters (individual runs of the `run_watermarking.py` script), the result is a bunch of directories, each containing a file full of model outputs (`gen_table_w_metrics.jsonl`).
70
+
71
+ To prepare to analyze the performance of the watermark, we enrich each one of these generation sets with more metrics and derived features. The script that accomplishes this is `process_rows.py` - each prompt, output pair is considered a "row".
72
+
73
+ The script isn't fully command-line parameterized, but inside you can see that the main method looks into a directory (such as the `all_runs` suggested above) and collects all of the subdirectories that contain `gen_table_w_metrics.jsonl` files. Each set of generations is reloaded from `jsonl` into a huggingface `Dataset` object so that a metric computation function, `compute_bl_metrics`, can be applied to it.
74
+
75
+ This adds critical fields like `w_bl_whitelist_fraction`, which represents the raw measurement of watermark presence. In the final analysis step, this is used to compute a z-score and perform the detection hypothesis test, as sketched below.
76
+
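+ As a minimal sketch of that final step (assuming the column names produced by `compute_bl_metrics` and the standard one-proportion z-test from the paper; this is illustrative, not the exact code used for the paper's figures):
+ 
+ ```
+ import numpy as np
+ 
+ def whitelist_z_score(row, bl_proportion):
+     """z-score for the observed whitelist fraction of a single generation."""
+     gamma = 1.0 - bl_proportion              # expected whitelist fraction under the null
+     T = row["w_bl_num_tokens_generated"]     # tokens scored for this row
+     f = row["w_bl_whitelist_fraction"]       # observed whitelist fraction
+     return (f - gamma) * np.sqrt(T) / np.sqrt(gamma * (1.0 - gamma))
+ ```
+ 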
77
+ **_Note_**: to clarify explicitly, `compute_bl_metrics` is therefore the old "detection" step of the pipeline. In this earlier version, there was no dedicated class structure to share the watermark logic between a generation object and a detector object; it simply lived within the `score_sequence` function of the `watermark.py` file.
78
+
79
+ The final step in `process_rows.py` is a concatenation of these results. Each `gen_table_w_metrics.jsonl` from a hyperparameter run (within an `all_runs`) is transformed into a new dataset with the watermark detection measurement, and then all of these dataset objects are concatenated in the row dimension, forming one large dataset that has the generations and metrics from all of the different hyperparameter settings that were run.
80
+
81
+ This object is shaped like (rows, columns) where samples = rows and features = columns; for the paper it had a size of roughly (3e4, 25), since there were about 30 to 40 hyperparameter settings and between 500 and 1000 generations per setting. Huggingface datasets conveniently implements a `dataset.to_pandas()` function, and this allows us to treat the result as a dataframe and slice and dice it however we like during the analysis phase.
82
+
83
+
84
+ ## Analysis
85
+
86
+ The result of the above steps is a somewhat standard "data science" format, a `pandas.DataFrame`, and we suggest that you analyze it in whatever way you see fit. Since this part was very interactive and exploratory, there isn't a stable script version of this stage.
87
+
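+ As an illustrative starting point (the dataset path and grouping columns below assume the output of `process_rows.py`, where run metadata such as `bl_logit_bias` and `bl_proportion` has been merged into each row):
+ 
+ ```
+ from datasets import load_from_disk
+ 
+ ds = load_from_disk("input/analysis_ds_example")  # illustrative save_dir from process_rows.py
+ df = ds.to_pandas()
+ 
+ # average watermark strength per hyperparameter setting
+ summary = (df.groupby(["bl_logit_bias", "bl_proportion"])["w_bl_whitelist_fraction"]
+              .agg(["mean", "count"]))
+ print(summary)
+ ```
+ 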
88
+ That said, the analysis code is in a notebook called `watermarking_analysis.ipynb`. Unfortunately, this notebook is monolithic. Pointers have been indicated as to which parts produce which figures. However, at this time, there is not a way to click once/run all and generate every chart and table from the paper.
89
+
90
+ A second notebook `watermarking_example_finding.ipynb` is solely for extracting some actual text prompts and outputs for tabulation in the paper.
91
+
lm-watermarking-main/experiments/io_utils.py ADDED
@@ -0,0 +1,116 @@
1
+ import os
2
+ import glob
3
+ import json
4
+ import logging
5
+ from typing import Any, Mapping, Iterable, Union, List, Callable, Optional
6
+
7
+ from tqdm.auto import tqdm
8
+
9
+
10
+ def resolve_globs(glob_paths: Union[str, Iterable[str]]):
11
+ """Returns filepaths corresponding to input filepath pattern(s)."""
12
+ filepaths = []
13
+ if isinstance(glob_paths, str):
14
+ glob_paths = [glob_paths]
15
+
16
+ for path in glob_paths:
17
+ filepaths.extend(glob.glob(path))
18
+
19
+ return filepaths
20
+
21
+
22
+ def read_jsonlines(filename: str) -> Iterable[Mapping[str, Any]]:
23
+ """Yields an iterable of Python dicts after reading jsonlines from the input file."""
24
+ file_size = os.path.getsize(filename)
25
+ with open(filename) as fp:
26
+ for line in tqdm(fp.readlines(), desc=f'Reading JSON lines from {filename}', unit='lines'):
27
+ try:
28
+ example = json.loads(line)
29
+ yield example
30
+ except json.JSONDecodeError as ex:
31
+ logging.error(f'Input text: "{line}"')
32
+ logging.error(ex.args)
33
+ raise ex
34
+
35
+
36
+ def hf_read_jsonlines(filename: str,
37
+ n: Optional[int]=None,
38
+ minimal_questions: Optional[bool]=False,
39
+ unique_questions: Optional[bool] = False) -> Iterable[Mapping[str, Any]]:
40
+ """Yields an iterable of Python dicts after reading jsonlines from the input file.
41
+ Optionally reads only first n lines from file."""
42
+ file_size = os.path.getsize(filename)
43
+ # O(n) but no memory
44
+ with open(filename) as f:
45
+ num_lines= sum(1 for _ in f)
46
+ if n is None:
47
+ n = num_lines
48
+
49
+ # returning a generator with the scope stmt seemed to be the issue, but I am not 100% sure
50
+ # I also don't know if there's a side effect, but I can't see how the scope wouldn't have
51
+ # remained open in the first place with the original version...
52
+ # with open(filename) as fp:
53
+ def line_generator():
54
+ unique_qc_ids = set()
55
+ # note: I am pretty sure that readlines is not lazy (it returns a list), thus really only the
56
+ # object conversion is lazy
57
+ for i, line in tqdm(enumerate(open(filename).readlines()[:n]), desc=f'Reading JSON lines from {filename}', unit='lines'):
58
+ try:
59
+ full_example = json.loads(line)
60
+
61
+ if unique_questions:
62
+ qc_id = full_example["object"]["qc_id"]
63
+ if qc_id in unique_qc_ids:
64
+ continue
65
+ else:
66
+ unique_qc_ids.add(qc_id)
67
+
68
+ if not minimal_questions:
69
+ example = full_example
70
+ else:
71
+ full_example = full_example
72
+ q_object = full_example["object"]
73
+ q_object.pop("question_info")
74
+ example= {}
75
+ example["object"] = {
76
+ "answer":q_object["answer"],
77
+ "clue_spans":q_object["clue_spans"],
78
+ "qc_id":q_object["qc_id"],
79
+ "question_text":q_object["question_text"],
80
+ }
81
+ yield example
82
+
83
+ except json.JSONDecodeError as ex:
84
+ logging.error(f'Input text: "{line}"')
85
+ logging.error(ex.args)
86
+ raise ex
87
+ return line_generator
88
+
89
+
90
+ def load_jsonlines(filename: str) -> List[Mapping[str, Any]]:
91
+ """Returns a list of Python dicts after reading jsonlines from the input file."""
92
+ return list(read_jsonlines(filename))
93
+
94
+
95
+ def write_jsonlines(objs: Iterable[Mapping[str, Any]], filename: str, to_dict: Callable = lambda x: x):
96
+ """Writes a list of Python Mappings as jsonlines at the input file."""
97
+ with open(filename, 'w') as fp:
98
+ for obj in tqdm(objs, desc=f'Writing JSON lines at {filename}'):
99
+ fp.write(json.dumps(to_dict(obj)))
100
+ fp.write('\n')
101
+
102
+
103
+ def read_json(filename: str) -> Mapping[str, Any]:
104
+ """Returns a Python dict representation of JSON object at input file."""
105
+ with open(filename) as fp:
106
+ return json.load(fp)
107
+
108
+
109
+ def write_json(obj: Mapping[str, Any], filename: str, indent:int=None):
110
+ """Writes a Python Mapping at the input file in JSON format."""
111
+ with open(filename, 'w') as fp:
112
+ json.dump(obj, fp, indent=indent)
113
+
114
+
115
+ def print_json(d, indent=4):
116
+ print(json.dumps(d, indent=indent))
lm-watermarking-main/experiments/launch.py ADDED
@@ -0,0 +1,222 @@
1
+ from submitit import AutoExecutor
2
+ from submitit.helpers import CommandFunction
3
+ from itertools import chain
4
+ import os
5
+ from submitit_utils import ParameterGrid
6
+ import argparse
7
+
8
+ # a debug/dry-run command
9
+ dummy_func = CommandFunction(["echo"], verbose=True)
10
+
11
+ ###############################################################################
12
+ # Experiment specific command and parameter setup
13
+ # (the structure is general, but the values are not)
14
+ ###############################################################################
15
+
16
+ base_run_name = None
17
+
18
+ ROOT_DIR = f'{os.getenv("ROOT_DIR")}'
19
+ # OUTPUT_DIR = f'{os.getenv("OUTPUT_DIR")}'
20
+ # OUTPUT_DIR = f'{os.getenv("OUTPUT_DIR")}_large_sweep'
21
+ # OUTPUT_DIR = f'{os.getenv("OUTPUT_DIR")}_large_sweep_downsize'
22
+ # OUTPUT_DIR = f'{os.getenv("OUTPUT_DIR")}_greedy_redo'
23
+ OUTPUT_DIR = f'{os.getenv("OUTPUT_DIR")}_greedy_more_gammas'
24
+
25
+ # starting command/program to which we will append arguments
26
+ cmdline_function = CommandFunction(["python"], verbose=True)
27
+
28
+ # script name
29
+ script_name = "run_watermarking.py"
30
+
31
+ # base args
32
+ base_script_args = {
33
+ # "model_name" :"facebook/opt-2.7b",
34
+ "model_name" :"facebook/opt-1.3b",
35
+ "dataset_name" :"c4",
36
+ "dataset_config_name":"realnewslike",
37
+ # "dataset_config_name":"en",
38
+ # "dataset_name": "cml_pile",
39
+ # "dataset_config_name": "all_train_00",
40
+ # "shuffle_dataset" :"True", # NOTE
41
+ "dynamic_seed" :"markov_1",
42
+ "store_spike_ents" :"True",
43
+ # "oracle_model_name" :"EleutherAI/gpt-j-6B",
44
+ "oracle_model_name" :"facebook/opt-2.7b",
45
+ "no_wandb" :"False",
46
+ }
47
+
48
+ # dynamic/hparam args
49
+ # i.e. the parameters we would like to cross and sweep over
50
+ hparam_sets = [
51
+ # # main sampling sweep, central data
52
+ # {
53
+ # "min_prompt_tokens": [50],
54
+ # "max_new_tokens": [200],
55
+ # "input_truncation_strategy": ["completion_length"],
56
+ # "input_filtering_strategy": ["prompt_and_completion_length"],
57
+ # "output_filtering_strategy": ["max_new_tokens"],
58
+ # "limit_indices": [500],
59
+ # "bl_logit_bias": [0.1, 0.5, 1.0, 2.0, 5.0, 10.0, 50.0],
60
+ # "bl_proportion": [0.1, 0.25, 0.5, 0.75, 0.9],
61
+ # "bl_type": ["soft"],
62
+ # "num_beams": [1],
63
+ # "use_sampling": [True],
64
+ # "sampling_temp": [0.7],
65
+ # },
66
+ # greedy and beams secondary demos
67
+ # {
68
+ # "min_sample_tokens":[0],
69
+ # "min_prompt_tokens": [200],
70
+ # "max_new_tokens": [500],
71
+ # "all_gas_no_eos": [True],
72
+ # "input_truncation_strategy": ["prompt_length"],
73
+ # "input_filtering_strategy": ["prompt_and_completion_length"],
74
+ # "output_filtering_strategy": ["no_filter"],
75
+ # "limit_indices": [500],
76
+ # "bl_logit_bias": [0.1, 0.5, 1.0, 2.0, 5.0, 10.0],
77
+ # "bl_proportion": [0.5],
78
+ # "bl_type": ["soft"],
79
+ # "num_beams": [1],
80
+ # "use_sampling": [False],
81
+ # "sampling_temp": [0.0],
82
+ # },
83
+ # {
84
+ # "min_sample_tokens":[0],
85
+ # "min_prompt_tokens": [200],
86
+ # "max_new_tokens": [500],
87
+ # "all_gas_no_eos": [True],
88
+ # "no_repeat_ngram_size": [0],
89
+ # "input_truncation_strategy": ["prompt_length"],
90
+ # "input_filtering_strategy": ["prompt_and_completion_length"],
91
+ # "output_filtering_strategy": ["no_filter"],
92
+ # "limit_indices": [500],
93
+ # "bl_logit_bias": [0.1, 0.5, 1.0, 2.0, 5.0, 10.0],
94
+ # "bl_proportion": [0.5],
95
+ # "bl_type": ["soft"],
96
+ # "num_beams": [4],
97
+ # "use_sampling": [False],
98
+ # "sampling_temp": [0.0],
99
+ # },
100
+ {
101
+ "min_sample_tokens":[0],
102
+ "min_prompt_tokens": [200],
103
+ "max_new_tokens": [500],
104
+ "all_gas_no_eos": [True],
105
+ "no_repeat_ngram_size": [0],
106
+ "input_truncation_strategy": ["prompt_length"],
107
+ "input_filtering_strategy": ["prompt_and_completion_length"],
108
+ "output_filtering_strategy": ["no_filter"],
109
+ "limit_indices": [500],
110
+ "bl_logit_bias": [0.1, 0.5, 1.0, 2.0, 5.0, 10.0],
111
+ # "bl_logit_bias": [2.0, 5.0, 10.0],
112
+ # "bl_proportion": [0.5],
113
+ # "bl_proportion": [0.75],
114
+ "bl_proportion": [0.9],
115
+ "bl_type": ["soft"],
116
+ "num_beams": [8],
117
+ "use_sampling": [False],
118
+ "sampling_temp": [0.0],
119
+ },
120
+ ############
121
+ ]
122
+
123
+ # logic to set derived arguments based on existing arguments in the sweep sets
124
+ # the unique run name is the canonical example
125
+ def add_conditional_params(param_dict):
126
+
127
+ # unique_name = f'{base_run_name+"_" if base_run_name else ""}{param_dict.get("model_name")}_{param_dict.get("dataset_name")}_{param_dict.get("dataset_config_name")}'
128
+ unique_name_keys = ["model_name",
129
+ "bl_type",
130
+ "dynamic_seed",
131
+ "bl_proportion",
132
+ "bl_logit_bias",
133
+ "num_beams",
134
+ "use_sampling",
135
+ "sampling_temp",
136
+ "dataset_name",
137
+ "dataset_config_name",
138
+ "min_prompt_tokens",
139
+ "max_new_tokens",
140
+ "input_truncation_strategy",
141
+ "input_filtering_strategy",
142
+ "output_filtering_strategy",
143
+ "limit_indices",
144
+ "oracle_model_name"]
145
+
146
+ unique_name = f'{base_run_name+"_" if base_run_name else ""}{"_".join([str(param_dict.get(k)) for k in unique_name_keys])}'
147
+ unique_name = unique_name.replace("/", "-").replace(".","-")
148
+ param_dict.update({"run_name": unique_name})
149
+ param_dict.update({"output_dir": f'{OUTPUT_DIR}/{param_dict["run_name"]}'})
150
+
151
+ # Queue up all the arguments
152
+ def add_params(param_dicts):
153
+ new_dicts = []
154
+ for i, param_dict in enumerate(param_dicts):
155
+ new_dict = {}
156
+
157
+ new_dict.update({script_name : ""}) # This requires parse block change in submitit.core.utils.py L320
158
+ new_dict.update(base_script_args)
159
+
160
+ new_dict.update(param_dict)
161
+ add_conditional_params(new_dict)
162
+
163
+ new_dicts.append(new_dict)
164
+ return new_dicts
165
+
166
+ ###############################################################################
167
+ # Generic submitit and slurm workflow
168
+ ###############################################################################
169
+
170
+ # set up the executor and sbatch settings
171
+ # executor = AutoExecutor(cluster='slurm', folder=f'{ROOT_DIR}/logs/')
172
+ # executor = AutoExecutor(cluster='slurm', folder=f'{ROOT_DIR}/logs_large_sweep/')
173
+ # executor = AutoExecutor(cluster='slurm', folder=f'{ROOT_DIR}/logs_large_sweep_downsize/')
174
+ # executor = AutoExecutor(cluster='slurm', folder=f'{ROOT_DIR}/logs_greedy_redo/')
175
+ executor = AutoExecutor(cluster='slurm', folder=f'{ROOT_DIR}/logs_greedy_more_gammas/')
176
+
177
+ executor.update_parameters(
178
+ stderr_to_stdout=True,
179
+ slurm_name='water',
180
+ # slurm_account='tomg',
181
+ # slurm_qos='very_high',
182
+ # slurm_qos='high',
183
+ slurm_mem= '52gb',
184
+ slurm_gres='gpu:rtxa6000:1',
185
+ slurm_time='14:00:00',
186
+ slurm_account='scavenger',
187
+ slurm_partition='scavenger',
188
+ slurm_qos='scavenger',
189
+ # slurm_mem= '32gb',
190
+ # slurm_cpus_per_task=4,
191
+ # slurm_gres='gpu:rtxa5000:1',
192
+ # slurm_time='12:00:00',
193
+ )
194
+
195
+ # cross and line up parameter combinations
196
+ arg_dicts = list(chain(*(ParameterGrid(p_set) for p_set in hparam_sets)))
197
+
198
+ # set params and apply any extra param logic
199
+ arg_dicts = add_params(arg_dicts)
200
+
201
+ if __name__ == "__main__":
202
+
203
+ parser = argparse.ArgumentParser()
204
+ parser.add_argument(
205
+ "-d", "--dry_run",
206
+ action="store_true",
207
+ help="just echo the commands to be run",
208
+ )
209
+ args = parser.parse_args()
210
+
211
+ # context to make this loop/list comp execute an array job
212
+ # rather than individual jobs
213
+ with executor.batch():
214
+
215
+ if args.dry_run:
216
+ fn = dummy_func
217
+ else:
218
+ fn = cmdline_function
219
+ jobs = [executor.submit(fn, **arg_dict) for arg_dict in arg_dicts]
220
+
221
+ for job,args in zip(jobs, arg_dicts):
222
+ print(f"Job={job} | uid={args['run_name']}")
lm-watermarking-main/experiments/process_rows.py ADDED
@@ -0,0 +1,250 @@
1
+ # Basic imports
2
+ import os
3
+ from functools import partial
4
+ from argparse import Namespace
5
+
6
+ import numpy as np
7
+
8
+ # HF classses
9
+ from transformers import AutoTokenizer
10
+
11
+ from datasets import Dataset, concatenate_datasets
12
+
13
+
14
+ # watermarking micro lib
15
+ from watermark import (BlacklistLogitsProcessor,
16
+ compute_bl_metrics)
17
+
18
+ # some file i/o helpers
19
+ from io_utils import read_jsonlines, read_json
20
+
21
+
22
+ from watermark import compute_bl_metrics, BlacklistLogitsProcessor
23
+
24
+
25
+ ###########################################################################
26
+ # Compute E[wl] for each example
27
+ ###########################################################################
28
+
29
+ def expected_whitelist(example,
30
+ idx,
31
+ exp_wl_coef: float = None,
32
+ drop_spike_entropies: bool = False):
33
+ assert "spike_entropies" in example, "Need to construct bl processor with store_spike_ents=True to compute them in post"
34
+
35
+ num_toks_gend = example["w_bl_num_tokens_generated"]
36
+ avg_spike_ent = np.mean(example["spike_entropies"])
37
+
38
+ example.update({"avg_spike_entropy":avg_spike_ent})
39
+ if drop_spike_entropies: del example["spike_entropies"]
40
+
41
+ exp_num_wl = (exp_wl_coef*num_toks_gend)*avg_spike_ent
42
+ var_num_wl = num_toks_gend*exp_wl_coef*avg_spike_ent*(1-(exp_wl_coef*avg_spike_ent))
43
+
44
+ example.update({"w_bl_exp_num_wl_tokens":exp_num_wl})
45
+ example.update({"w_bl_var_num_wl_tokens":var_num_wl})
46
+
47
+ example.update({"exp_wl_coef":exp_wl_coef})
48
+
49
+ if num_toks_gend > 0:
50
+ example.update({"w_bl_exp_whitelist_fraction":exp_num_wl/num_toks_gend,
51
+ "w_bl_var_whitelist_fraction":var_num_wl/num_toks_gend})
52
+ else:
53
+ example.update({"w_bl_exp_whitelist_fraction":-1,
54
+ "w_bl_var_whitelist_fraction":-1})
55
+ return example
56
+
57
+
58
+ from typing import Callable
59
+
60
+ def add_metadata(ex, meta_table=None):
61
+ ex.update(meta_table)
62
+ return ex
63
+
64
+
65
+ def str_replace_bug_check(example,idx):
66
+ baseline_before = example["baseline_completion"]
67
+ example["baseline_completion"] = baseline_before.replace(example["truncated_input"][:-1],"")
68
+ if example["baseline_completion"] != baseline_before:
69
+ print("baseline input replacement bug occurred, skipping row!")
70
+ return False
71
+ else:
72
+ return True
73
+
74
+
75
+ def load_all_datasets(run_names: list[str]=None,
76
+ base_run_dir: str=None,
77
+ meta_name: str=None,
78
+ gen_name: str=None,
79
+ apply_metric_func: bool=False,
80
+ convert_to_pandas: bool = False,
81
+ drop_buggy_rows: bool = False,
82
+ limit_output_tokens: int = 0,
83
+ save_ds: bool = True,
84
+ save_dir: str=None):
85
+
86
+ print(f"Loading {len(run_names)} datasets from {base_run_dir}...")
87
+
88
+ if not isinstance(gen_name, Callable):
89
+ file_check = lambda name: os.path.exists(f"{base_run_dir}/{name}/{gen_name}")
90
+ assert all([file_check(name) for name in run_names]), f"Make sure all the run dirs contain the required data files: {meta_name} and {gen_name}"
91
+
92
+ all_datasets = []
93
+ for i,run_name in enumerate(run_names):
94
+
95
+ print(f"[{i}] Loading dataset")
96
+
97
+ run_base_dir = f"{base_run_dir}/{run_name}"
98
+ gen_table_meta_path = f"{run_base_dir}/{meta_name}"
99
+
100
+ if isinstance(gen_name, Callable):
101
+ gen_table_path = f"{run_base_dir}/{gen_name(run_name)}"
102
+ else:
103
+ gen_table_path = f"{run_base_dir}/{gen_name}"
104
+
105
+ # load the raw files
106
+ gen_table_meta = read_json(gen_table_meta_path)
107
+ gen_table_lst = [ex for ex in read_jsonlines(gen_table_path)]
108
+ gen_table_ds = Dataset.from_list(gen_table_lst)
109
+
110
+ print(f"Original dataset length={len(gen_table_ds)}")
111
+
112
+ # drop the rows where the string replace thing happens
113
+ if drop_buggy_rows:
114
+ gen_table_ds_filtered = gen_table_ds.filter(str_replace_bug_check,batched=False,with_indices=True)
115
+ else:
116
+ gen_table_ds_filtered = gen_table_ds
117
+
118
+ # enrich all rows with the run metadata
119
+ add_meta = partial(
120
+ add_metadata,
121
+ meta_table=gen_table_meta
122
+ )
123
+ gen_table_w_meta = gen_table_ds_filtered.map(add_meta, batched=False)
124
+
125
+ # optionally, apply the metric function(s) - somewhat expensive
126
+ # want to do this here rather than at end because you need each run's tokenizer
127
+ # though tbh it would be odd if they're not the same, but you can check that at the end
128
+ if apply_metric_func:
129
+
130
+ tokenizer = AutoTokenizer.from_pretrained(gen_table_meta["model_name"])
131
+
132
+ comp_bl_metrics = partial(
133
+ compute_bl_metrics,
134
+ tokenizer=tokenizer,
135
+ hf_model_name=gen_table_meta["model_name"],
136
+ initial_seed=gen_table_meta["initial_seed"],
137
+ dynamic_seed=gen_table_meta["dynamic_seed"],
138
+ bl_proportion=gen_table_meta["bl_proportion"],
139
+ use_cuda=True, # this is obviously critical to match the pseudorandomness
140
+ record_hits=True,
141
+ limit_output_tokens=limit_output_tokens,
142
+ )
143
+ gen_table_w_bl_metrics = gen_table_w_meta.map(comp_bl_metrics, batched=False, with_indices=True)
144
+
145
+
146
+ # Construct the blacklist processor so you can get the expectation coef
147
+ all_token_ids = list(tokenizer.get_vocab().values())
148
+ vocab_size = len(all_token_ids)
149
+ args = Namespace()
150
+ args.__dict__.update(gen_table_meta)
151
+
152
+ bl_processor = BlacklistLogitsProcessor(bad_words_ids=None,
153
+ store_bl_ids=False,
154
+ store_spike_ents=True,
155
+ eos_token_id=tokenizer.eos_token_id,
156
+ vocab=all_token_ids,
157
+ vocab_size=vocab_size,
158
+ bl_proportion=args.bl_proportion,
159
+ bl_logit_bias=args.bl_logit_bias,
160
+ bl_type=args.bl_type,
161
+ initial_seed= args.initial_seed,
162
+ dynamic_seed=args.dynamic_seed)
163
+
164
+ if "spike_entropies" in gen_table_w_bl_metrics.column_names:
165
+ comp_exp_num_wl = partial(
166
+ expected_whitelist,
167
+ exp_wl_coef=bl_processor.expected_wl_coef,
168
+ drop_spike_entropies=False,
169
+ # drop_spike_entropies=True,
170
+ )
171
+ gen_table_w_spike_ents = gen_table_w_bl_metrics.map(comp_exp_num_wl, batched=False, with_indices=True)
172
+ final_single_run_ds = gen_table_w_spike_ents
173
+ else:
174
+ final_single_run_ds = gen_table_w_bl_metrics
175
+ else:
176
+ final_single_run_ds = gen_table_w_meta
177
+
178
+ all_datasets.append(final_single_run_ds)
179
+
180
+ ds = concatenate_datasets(all_datasets)
181
+
182
+ if save_ds:
183
+ ds.save_to_disk(save_dir)
184
+
185
+ if convert_to_pandas:
186
+ df = ds.to_pandas()
187
+ return df
188
+ else:
189
+ return ds
190
+
191
+
192
+ output_dir = "/cmlscratch/jkirchen/spiking-root/lm-blacklisting/output_large_sweep"
193
+ # output_dir = "/cmlscratch/jkirchen/spiking-root/lm-blacklisting/output_large_sweep_downsize"
194
+
195
+ # output_dir = "/cmlscratch/jkirchen/spiking-root/lm-blacklisting/output_large_sweep_downsize"
196
+ # output_dir = "/cmlscratch/jkirchen/spiking-root/lm-blacklisting/output_greedy_redo"
197
+ # output_dir = "/cmlscratch/jkirchen/spiking-root/lm-blacklisting/output_greedy_gamma_0-25"
198
+
199
+ run_names = list(filter(lambda name: os.path.exists(f"{output_dir}/{name}/gen_table_w_metrics.jsonl"), sorted(os.listdir(output_dir))))
200
+ run_names = list(filter(lambda name: "realnewslike" in name, run_names))
201
+ # run_names = list(filter(lambda name: "pile" in name, run_names))
202
+ # run_names = list(filter(lambda name: "c4_en" in name, run_names))
203
+
204
+
205
+ # output_dir = "/cmlscratch/jkirchen/spiking-root/lm-blacklisting/output_attacked_greedy_updated"
206
+ # # output_dir = "/cmlscratch/jkirchen/spiking-root/lm-blacklisting/output_attacked_new"
207
+ # run_names = list(filter(lambda name: os.path.exists(f"{output_dir}/{name}/gen_table_w{('_'+name) if 't5' in name else ''}_attack_metrics.jsonl"), sorted(os.listdir(output_dir))))
208
+ # run_names = list(filter(lambda name: os.path.exists(f"{output_dir}/{name}/gen_table_w_attack_metrics.jsonl"), sorted(os.listdir(output_dir))))
209
+
210
+ runs_to_load = run_names
211
+
212
+
213
+ print(len(run_names))
214
+ for name in run_names: print(name)
215
+
216
+ runs_ready = [os.path.exists(f"{output_dir}/{name}/gen_table_w_metrics.jsonl") for name in runs_to_load]
217
+ # runs_ready = [os.path.exists(f"{output_dir}/{name}/gen_table_w_attack_metrics.jsonl") for name in runs_to_load]
218
+ print(f"all runs ready? {all(runs_ready)}\n{runs_ready}")
219
+
220
+
221
+ # save_name = "analysis_ds_1-21_greedy_redo"
222
+ # save_name = "analysis_ds_1-21_greedy_redo_truncated"
223
+ # save_name = "analysis_ds_1-21_greedy_redo_truncated_sanity_check"
224
+ # save_name = "analysis_ds_1-19_realnews_1-3_v2_hitlist_check"
225
+ # save_name = "analysis_ds_1-20_more_attack"
226
+
227
+ # save_name = "analysis_ds_1-23_greedy_gamma_0-25_truncated"
228
+ # save_name = "analysis_ds_1-21_greedy_attacked_updated_truncated"
229
+
230
+ # save_name = "analysis_ds_1-23_pile_1-3"
231
+ # save_name = "analysis_ds_1-23_en_1-3"
232
+
233
+ save_name = "analysis_ds_1-30_realnews_2-7"
234
+
235
+ save_dir = f"input/{save_name}"
236
+
237
+ raw_data = load_all_datasets(run_names=runs_to_load,
238
+ base_run_dir=output_dir,
239
+ meta_name="gen_table_meta.json",
240
+ gen_name="gen_table_w_metrics.jsonl",
241
+ # gen_name="gen_table_w_attack_metrics.jsonl",
242
+ apply_metric_func=True,
243
+ # drop_buggy_rows=True,
244
+ drop_buggy_rows=False,
245
+ # limit_output_tokens=200,
246
+ convert_to_pandas=False,
247
+ save_ds=True,
248
+ save_dir=save_dir)
249
+
250
+ print(f"All finished with {save_dir}!!")
lm-watermarking-main/experiments/run_watermarking.py ADDED
@@ -0,0 +1,705 @@
1
+ # Basic imports
2
+ import sys
3
+ import os
4
+ import argparse
5
+ from typing import List, Iterable, Optional
6
+ from functools import partial
7
+ import time
8
+
9
+ from tqdm import tqdm
10
+ import random
11
+ import math
12
+ from statistics import mean
13
+
14
+ import numpy as np
15
+ import torch
16
+ from torch import Tensor
17
+ from tokenizers import Tokenizer
18
+
19
+ import wandb
20
+ import matplotlib.pyplot as plt
21
+
22
+ # cache path before HF imports just for kicks
23
+ # bc I don't really know when this is pulled by the library
24
+ # TODO change to passing as an arg to the model load fn
25
+ USER = "jkirchen"
26
+ # Huggingface cache
27
+ HF_HOME=f"/cmlscratch/{USER}/.cache/huggingface"
28
+ # HF_HOME=f"/scratch0/{USER}/.cache/huggingface"
29
+ # HF_HOME=f"/scratch1/{USER}/.cache/huggingface"
30
+ os.environ["HF_HOME"] = HF_HOME
31
+
32
+ print(os.environ["HF_HOME"])
33
+
34
+ # HF classses
35
+ from transformers import (AutoTokenizer,
36
+ AutoModelForSeq2SeqLM,
37
+ AutoModelForCausalLM,
38
+ LogitsProcessorList)
39
+
40
+ from datasets import load_dataset, Dataset
41
+
42
+ # watermarking micro lib
43
+ from watermark import (BlacklistLogitsProcessor,
44
+ add_idx,
45
+ check_input_lengths,
46
+ check_output_lengths,
47
+ tokenize_for_generation,
48
+ generate_completions,
49
+ evaluate_generation_fluency)
50
+
51
+ # better bool flag type for argparse
52
+ from submitit_utils import str2bool
53
+
54
+ # some file i/o helpers
55
+ from io_utils import write_jsonlines, write_json, read_jsonlines, read_json
56
+
57
+ def main(args):
58
+
59
+ ###########################################################################
60
+ # Start logging
61
+ ###########################################################################
62
+ if not args.no_wandb:
63
+
64
+ # storing slurm info to be sent to wandb to allow auditing logfiles later
65
+ args.SLURM_JOB_ID = os.getenv("SLURM_JOB_ID")
66
+ args.SLURM_ARRAY_JOB_ID = os.getenv("SLURM_ARRAY_JOB_ID")
67
+ args.SLURM_ARRAY_TASK_ID = os.getenv("SLURM_ARRAY_TASK_ID")
68
+
69
+ # start a new wandb run to track this experiment, will send data to it later
70
+ run = wandb.init(
71
+ # set the wandb project where this run will be logged
72
+ project=args.wandb_project,
73
+ entity=args.wandb_entity,
74
+ name=args.run_name,
75
+
76
+ # track hyperparameters and run metadata
77
+ config=args
78
+ )
79
+
80
+ print(f"Output dir for this run: {args.output_dir}")
81
+ # notify if exists
82
+ if os.path.exists(args.output_dir):
83
+ print(f"Output dir for this run already exists!")
84
+ print(f"Contents: {sorted(os.listdir(args.output_dir))}")
85
+ else:
86
+ # create the output dir where run artifacts are stored
87
+ os.makedirs(args.output_dir)
88
+
89
+ ###########################################################################
90
+ # Instantiate model and tokenizer
91
+ ###########################################################################
92
+ hf_model_name = args.model_name
93
+
94
+ if "t5" in hf_model_name or "T0" in hf_model_name:
95
+ model = AutoModelForSeq2SeqLM.from_pretrained(hf_model_name)
96
+ else:
97
+ model = AutoModelForCausalLM.from_pretrained(hf_model_name)
98
+
99
+ tokenizer = AutoTokenizer.from_pretrained(hf_model_name)
100
+
101
+ # defaults to device 0
102
+ # will need to use 'parallelize' for multi-gpu sharding
103
+ device = "cuda" if torch.cuda.is_available() else "cpu"
104
+ model = model.to(device)
105
+ model.eval()
106
+
107
+ ###########################################################################
108
+ # Load the dataset
109
+ ###########################################################################
110
+
111
+ dataset_name, dataset_config_name = args.dataset_name, args.dataset_config_name
112
+
113
+ if dataset_name == "cml_pile":
114
+ subsets = [dataset_config_name]
115
+ dataset = load_dataset("input/cml_pile.py",
116
+ subsets=subsets,
117
+ streaming=True,
118
+ split=None,
119
+ ignore_verifications=True)["train"]
120
+ else:
121
+ dataset = load_dataset(dataset_name, dataset_config_name, split="train", streaming=True)
122
+
123
+ # log an example
124
+ ds_iterator = iter(dataset)
125
+ idx = 75 # if this is c4, it's the schumacher example lol
126
+ i = 0
127
+ while i < idx:
128
+ next(ds_iterator)
129
+ i += 1
130
+
131
+ example = next(ds_iterator)
132
+ print(example)
133
+
134
+ ###########################################################################
135
+ # Construct the blacklist processor/sampler
136
+ ###########################################################################
137
+
138
+ all_token_ids = list(tokenizer.get_vocab().values())
139
+ vocab_size = len(all_token_ids)
140
+ print(f"Vocabulary size: {vocab_size}")
141
+
142
+ max_new_tokens = args.max_new_tokens
143
+ min_prompt_tokens = args.min_prompt_tokens
144
+
145
+ init_seed = args.initial_seed
146
+ dyna_seed=args.dynamic_seed # type not value
147
+ bl_proportion = args.bl_proportion
148
+ bl_logit_bias = args.bl_logit_bias
149
+ bl_type = args.bl_type
150
+ n_beams = args.num_beams
151
+ early_stopping = args.early_stopping
152
+ no_repeat_ngram_size = args.no_repeat_ngram_size
153
+ store_bl_ids = args.store_bl_ids
154
+ store_spike_ents = args.store_spike_ents
155
+
156
+ bl_processor = BlacklistLogitsProcessor(bad_words_ids=None,
157
+ store_bl_ids=store_bl_ids,
158
+ store_spike_ents=store_spike_ents,
159
+ eos_token_id=tokenizer.eos_token_id,
160
+ vocab=all_token_ids,
161
+ vocab_size=vocab_size,
162
+ bl_proportion=bl_proportion,
163
+ bl_logit_bias=bl_logit_bias,
164
+ bl_type=bl_type,
165
+ initial_seed=init_seed,
166
+ dynamic_seed=dyna_seed)
167
+
168
+ logit_processor_lst = LogitsProcessorList([bl_processor])
169
+
170
+ # Greedy and basic beam search, default
171
+ gen_kwargs = dict(
172
+ max_new_tokens=max_new_tokens,
173
+ num_beams=n_beams,
174
+ )
175
+ if n_beams > 1:
176
+ # these are only for beam search repetition correction
177
+ if no_repeat_ngram_size > 0:
178
+ gen_kwargs.update(dict(no_repeat_ngram_size=no_repeat_ngram_size))
179
+ gen_kwargs.update(dict(early_stopping=early_stopping))
180
+
181
+ if args.use_sampling:
182
+ gen_kwargs.update(dict(do_sample=True,
183
+ top_k=0,
184
+ temperature=args.sampling_temp))
185
+ if args.all_gas_no_eos:
186
+ gen_kwargs.update(dict(suppress_tokens=[tokenizer.eos_token_id]))
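+ # suppress_tokens sets the EOS logit to -inf at every step, so generations run the full max_new_tokens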
187
+
188
+ generate_without_blacklist = partial(
189
+ model.generate,
190
+ **gen_kwargs
191
+ )
192
+ generate_with_blacklist = partial(
193
+ model.generate,
194
+ logits_processor=logit_processor_lst,
195
+ **gen_kwargs
196
+ )
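+ # the two generate partials share gen_kwargs and differ only in whether the
+ # blacklist (watermark) logits processor is attached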
197
+
198
+ ###########################################################################
199
+ # Construct the generation and measurement pipeline (lazy)
200
+ # that pulls from the streaming dataset, applies the generations map funcs
201
+ ###########################################################################
202
+
203
+ # Set up the pipeline functions
204
+ if "c4" in dataset_name:
205
+ columns_to_remove = ["text","timestamp","url"]
206
+ else:
207
+ columns_to_remove = []
208
+
209
+ # Construct the data filtering/sampling scheme partials
210
+ token_kwargs = dict(
211
+ hf_model_name=hf_model_name,
212
+ tokenizer=tokenizer,
213
+ model=model,
214
+ )
215
+ if args.input_truncation_strategy == "prompt_length":
216
+ token_kwargs.update(dict(min_prompt_tokens=min_prompt_tokens))
217
+ elif args.input_truncation_strategy == "completion_length":
218
+ token_kwargs.update(dict(max_new_tokens=max_new_tokens))
219
+ else:
220
+ ValueError(f"Unknown input truncation strategy {args.input_truncation_strategy}")
221
+ tokenize_prompts = partial(
222
+ tokenize_for_generation,
223
+ **token_kwargs
224
+ )
225
+
226
+ input_check_kwargs = dict(
227
+ # min_sample_len = min_prompt_tokens + max_new_tokens,
228
+ min_sample_len = args.min_sample_tokens, # first line is a bug sometimes with large amounts
229
+ )
230
+ if args.input_filtering_strategy == "prompt_length":
231
+ input_check_kwargs.update(dict(min_prompt_len = min_prompt_tokens,
232
+ min_completion_len = 0))
233
+ elif args.input_filtering_strategy == "completion_length":
234
+ input_check_kwargs.update(dict(min_prompt_len = 0,
235
+ min_completion_len = max_new_tokens))
236
+ elif args.input_filtering_strategy == "prompt_and_completion_length":
237
+ input_check_kwargs.update(dict(min_prompt_len = min_prompt_tokens,
238
+ min_completion_len = max_new_tokens))
239
+ else:
240
+ ValueError(f"Unknown input filtering strategy {args.input_filtering_strategy}")
241
+ input_check = partial(
242
+ check_input_lengths,
243
+ **input_check_kwargs
244
+ )
245
+
246
+ if args.output_filtering_strategy == "max_new_tokens":
247
+ output_kwargs = dict(min_output_len = max_new_tokens)
248
+ elif args.output_filtering_strategy == "no_filter":
249
+ output_kwargs = dict(min_output_len = 0)
250
+ else:
251
+ ValueError(f"Unknown output filtering strategy {args.output_filtering_strategy}")
252
+ output_check = partial(
253
+ check_output_lengths,
254
+ **output_kwargs
255
+ )
256
+
257
+ gen_completions = partial(
258
+ generate_completions,
259
+ max_new_tokens=max_new_tokens,
260
+ hf_model_name=hf_model_name,
261
+ tokenizer=tokenizer,
262
+ model=model,
263
+ no_bl_partial=generate_without_blacklist,
264
+ w_bl_partial=generate_with_blacklist,
265
+ bl_processor_list=logit_processor_lst,
266
+ )
267
+
268
+ ###########################################################################
269
+ # Compose/apply the pipeline steps
270
+ ###########################################################################
271
+
272
+ # Apply the pipeline operations to the dataset
273
+ indexed_dataset = dataset.map(add_idx, batched=False, with_indices=True)
274
+
275
+ # shuffle the first shuffle_buffer_size rows of the (streaming) dataset
276
+ if args.shuffle_dataset:
277
+ shuffled_dataset = indexed_dataset.shuffle(seed=args.shuffle_seed,
278
+ buffer_size=args.shuffle_buffer_size)
279
+ else:
280
+ shuffled_dataset = indexed_dataset
281
+
282
+ # tokenize and truncate the row inputs to create prompts according to the strategy spec'd above
283
+ tokenized_and_truncated_dataset = shuffled_dataset.map(tokenize_prompts,
284
+ batched=False,
285
+ with_indices=True)
286
+
287
+ # filter the rows of the dataset based on length checks for the tokenized prompts and baseline completions
288
+ input_length_filtered_dataset = tokenized_and_truncated_dataset.filter(input_check,
289
+ batched=False,
290
+ with_indices=True)
291
+
292
+ # perform generation by calling the models
293
+ columns_to_remove += ["inputs", "untruncated_inputs"] # these are now materialized and must be dropped externally
294
+ generations_dataset = input_length_filtered_dataset.map(gen_completions,
295
+ batched=False,
296
+ with_indices=True,
297
+ remove_columns=columns_to_remove)
298
+
299
+ # # filter the dataset a last time based on the lengths of the outputs of the model
300
+ # output_length_filtered_dataset = generations_dataset.filter(output_check,
301
+ # batched=False,
302
+ # with_indices=True)
303
+
304
+ ###########################################################################
305
+ # Main loop - actually executes the generation pipeline.
306
+ # and accumulates the result rows in a list, assumes list is "small"-ish
307
+ # and we aren't accumulating any tensors or other memory hogging artifacts
308
+ ###########################################################################
309
+ if not args.load_prev_generations:
310
+
311
+ processed_examples = []
312
+ ds_iterator = iter(generations_dataset)
313
+ i = 0
314
+ while i < args.limit_indices:
315
+
316
+ ex = next(ds_iterator)
317
+
318
+ # log basics to stdout
319
+ print(f"#"*80)
320
+ print(f"dataset index: {ex['idx']}")
321
+ print(f"orig_sample_length: {ex['orig_sample_length']}")
322
+ print(f"prompt_length: {ex['prompt_length']}")
323
+ print(f"real_completion_length: {ex['real_completion_length']}")
324
+ print(f"no_bl_num_tokens_generated: {ex['no_bl_num_tokens_generated']}")
325
+ print(f"w_bl_num_tokens_generated: {ex['w_bl_num_tokens_generated']}")
326
+
327
+ print(f"\ntruncated_input: ")
328
+ print(ex["truncated_input"])
329
+ print(f"\nbaseline_completion: ")
330
+ print(ex["baseline_completion"])
331
+ print(f"\nno_bl_output: ")
332
+ print(ex["no_bl_output"])
333
+ print(f"\nw_bl_output: ")
334
+ print(ex["w_bl_output"])
335
+ print(f"\nno_bl_gen_time: ")
336
+ print(ex["no_bl_gen_time"])
337
+ print(f"\nno_bl_sec_per_tok: ")
338
+ print(ex["no_bl_sec_per_tok"])
339
+ print(f"\nno_bl_tok_per_sec: ")
340
+ print(ex["no_bl_tok_per_sec"])
341
+ print(f"\nw_bl_gen_time: ")
342
+ print(ex["w_bl_gen_time"])
343
+ print(f"\nw_bl_sec_per_tok: ")
344
+ print(ex["w_bl_sec_per_tok"])
345
+ print(f"\nw_bl_tok_per_sec: ")
346
+ print(ex["w_bl_tok_per_sec"])
347
+
348
+ processed_examples.append(ex)
349
+ if output_check(ex):
350
+ i += 1
351
+ else:
352
+ print(f"\nGeneration too short, saving outputs, but not incrementing counter...\n",
353
+ f"{i} of {len(processed_examples)} rows were satisfactory so far",
354
+ f"current generation overhead ratio: {round(len(processed_examples)/(i+1), 3)}",
355
+ f"completed {round(i/args.limit_indices, 2)} of total")
356
+
357
+ print(f"#"*80,
358
+ f"\nGeneration output length check overhead was num rows processed={len(processed_examples)}",
359
+ f"for {args.limit_indices} samples. Ratio: {round(len(processed_examples)/args.limit_indices, 3)}")
360
+
361
+ ###########################################################################
362
+ # Generation jsonl dumping/loading
363
+ ###########################################################################
364
+
365
+ gen_table_meta_path = f"{args.output_dir}/gen_table_meta.json"
366
+ gen_table_path = f"{args.output_dir}/gen_table.jsonl"
367
+ safe_gen_table_path = f"{args.output_dir}/gen_table_safe.jsonl"
368
+
369
+ args.gen_table_already_existed = False
370
+
371
+ if not args.load_prev_generations:
372
+
373
+ if os.path.exists(gen_table_path):
374
+ print(f"Found existing generation files at this output dir: {args.output_dir}")
375
+ print(f"Writing generations at alternate, safe path and exiting. Note! this only works once. "
376
+ f"Safe version will get overwritten next time ... ")
377
+ gen_table_path = f"{args.output_dir}/gen_table_safe.jsonl"
378
+ args.gen_table_already_existed = True
379
+
380
+ gen_table_meta = args.__dict__
381
+ gen_table = processed_examples
382
+
383
+ write_jsonlines(gen_table, gen_table_path)
384
+ write_json(gen_table_meta,gen_table_meta_path,indent=4)
385
+
386
+ if args.gen_table_already_existed:
387
+ # finish the wandb run
388
+ if not args.no_wandb: run.finish()
389
+ return # from main, for safety
390
+ else:
391
+ print(f"Loading previously generated outputs for evaluation via oracle model and metrics...")
392
+
393
+ assert os.path.exists(gen_table_meta_path), f"failed file check for prev generations metadata json file: {gen_table_meta_path}"
394
+ assert os.path.exists(gen_table_path), f"failed file check for prev generations jsonl file: {gen_table_path}"
395
+
396
+ curr_gen_table_meta = args.__dict__.copy()
397
+ prev_gen_table_meta = read_json(gen_table_meta_path)
398
+
399
+ assert not prev_gen_table_meta["gen_table_already_existed"], f"failed for safety bc 'gen_table_already_existed' was true in the metadata file in this dir, indicating a possible issue"
400
+ assert not os.path.exists(safe_gen_table_path), f"failed for safety bc there is a secondary 'safe' marked file in this dir indicating a possible issue"
401
+
402
+ params_to_ignore = ["load_prev_generations","SLURM_JOB_ID","SLURM_ARRAY_JOB_ID","SLURM_ARRAY_TASK_ID"]
403
+ for k in params_to_ignore:
404
+ curr_gen_table_meta.pop(k, None) # pop, not del: the SLURM keys are absent when wandb logging is disabled
405
+ prev_gen_table_meta.pop(k, None)
406
+ assert curr_gen_table_meta == prev_gen_table_meta, "failed safety check that current script params equal the params for the prev generations being loaded"
407
+
408
+ # gen_table_meta = argparse.Namespace(**args.__dict__)
409
+ gen_table_meta = args
410
+ gen_table = [ex for ex in read_jsonlines(gen_table_path)]
411
+
412
+ if args.generate_only:
413
+ # finish the wandb run
414
+ if not args.no_wandb: run.finish()
415
+ return # early exit, will reload later for ppl scoring
416
+
417
+ # Create a new dataset object either from the loop over examples
418
+ # or from the reloaded json lines
419
+
420
+ # gen_table_ds = Dataset.from_generator(ex for ex in gen_table) # hack since from_list is newer, and had 2.4.0
421
+ gen_table_ds = Dataset.from_list(gen_table)
422
+
423
+ ###########################################################################
424
+ # Perplexity (PPL) evaluation
425
+ # which is a separate step partially bc it requires a different model on gpu
426
+ ###########################################################################
427
+
428
+ # Load the oracle model for PPL measurement
429
+ # Assume on single GPU and need to free orig model memory for oracle model
430
+ if model is not None:
431
+ model = model.to(torch.device("cpu"))
432
+ del model
433
+
434
+ oracle_model_name = args.oracle_model_name
435
+ print(f"Loading oracle model: {oracle_model_name}")
436
+
437
+ oracle_tokenizer = AutoTokenizer.from_pretrained(oracle_model_name)
438
+ oracle_model = AutoModelForCausalLM.from_pretrained(oracle_model_name).to(device)
439
+ oracle_model.eval()
440
+
441
+ # construct fluency/ppl partial
442
+ eval_gen_metrics = partial(
443
+ evaluate_generation_fluency,
444
+ oracle_model_name=oracle_model_name,
445
+ oracle_model=oracle_model,
446
+ oracle_tokenizer=oracle_tokenizer
447
+ )
448
+
449
+ print(f"Computing metrics on model generations: {gen_table_ds}")
450
+
451
+ gen_table_w_metrics_ds = gen_table_ds.map(eval_gen_metrics, batched=False, with_indices=True)
452
+
453
+
454
+ print(f"#"*80)
455
+ print(f"baseline avg PPL: {mean(gen_table_w_metrics_ds['baseline_ppl'])}")
456
+ print(f"baseline avg loss: {mean(gen_table_w_metrics_ds['baseline_loss'])}")
457
+ print(f"no_bl avg PPL: {mean(gen_table_w_metrics_ds['no_bl_ppl'])}")
458
+ print(f"no_bl avg loss: {mean(gen_table_w_metrics_ds['no_bl_loss'])}")
459
+ print(f"w_bl avg PPL: {mean(gen_table_w_metrics_ds['w_bl_ppl'])}")
460
+ print(f"w_bl avg loss: {mean(gen_table_w_metrics_ds['w_bl_loss'])}")
461
+
462
+ # clear the model just for fun
463
+ oracle_model = oracle_model.to(torch.device("cpu"))
464
+ del oracle_model
465
+
466
+ gen_table_w_metrics_path = f"{args.output_dir}/gen_table_w_metrics.jsonl"
467
+ if os.path.exists(gen_table_w_metrics_path):
468
+ print(f"Found existing generation files with metrics added at this output dir. Overwriting anyway :\ -> {args.output_dir}")
469
+
470
+ gen_table_w_metrics_lst = [ex for ex in gen_table_w_metrics_ds]
471
+ write_jsonlines(gen_table_w_metrics_lst, gen_table_w_metrics_path)
472
+
473
+ # finish the wandb run
474
+ if not args.no_wandb: run.finish()
475
+
476
+ return
477
+
478
+
479
+ if __name__ == "__main__":
480
+
481
+ parser = argparse.ArgumentParser(description="Run watermarked huggingface LM generation pipeline")
482
+ parser.add_argument(
483
+ "--model_name",
484
+ type=str,
485
+ default="facebook/opt-2.7b",
486
+ help="Main model, path to pretrained model or model identifier from huggingface.co/models.",
487
+ )
488
+ parser.add_argument(
489
+ "--dataset_name",
490
+ type=str,
491
+ default="c4",
492
+ help="The name of the dataset to use (via the datasets library).",
493
+ )
494
+ parser.add_argument(
495
+ "--dataset_config_name",
496
+ type=str,
497
+ default="realnewslike",
498
+ help="The configuration name of the dataset to use (via the datasets library).",
499
+ )
500
+ parser.add_argument(
501
+ "--shuffle_dataset",
502
+ type=str2bool,
503
+ default=False,
504
+ help="Whether to shuffle the dataset before sampling.",
505
+ )
506
+ parser.add_argument(
507
+ "--shuffle_seed",
508
+ type=int,
509
+ default=1234,
510
+ help="The seed to use for dataset shuffle op.",
511
+ )
512
+ parser.add_argument(
513
+ "--shuffle_buffer_size",
514
+ type=int,
515
+ default=10_000,
516
+ help="The buffer size to use for dataset shuffle op - takes n rows first, then shuffles those indices",
517
+ )
518
+ parser.add_argument(
519
+ "--max_new_tokens",
520
+ type=int,
521
+ default=100,
522
+ help="The number of tokens to generate using the model, and the num tokens removed from real text sample",
523
+ )
524
+ parser.add_argument(
525
+ "--min_prompt_tokens",
526
+ type=int,
527
+ default=50, # 500
528
+ help="The number of examples (first N) to process from the dataset.",
529
+ )
530
+ parser.add_argument(
531
+ "--min_sample_tokens",
532
+ type=int,
533
+ default=0,
534
+ help="The the minimum length of raw prompt samples to consider.",
535
+ )
536
+ parser.add_argument(
537
+ "--limit_indices",
538
+ type=int,
539
+ default=5, # 500
540
+ help="The number of examples (first N) to process from the dataset.",
541
+ )
542
+ parser.add_argument(
543
+ "--input_truncation_strategy",
544
+ type=str,
545
+ default="completion_length",
546
+ choices=["completion_length", "prompt_length"],
547
+ help="The strategy to use when tokenizing and truncating raw inputs to make prompts.",
548
+ )
549
+ parser.add_argument(
550
+ "--input_filtering_strategy",
551
+ type=str,
552
+ default="completion_length",
553
+ choices=["completion_length", "prompt_length", "prompt_and_completion_length"],
554
+ help="The strategy to use when tokenizing and truncating raw inputs to make prompts.",
555
+ )
556
+ parser.add_argument(
557
+ "--output_filtering_strategy",
558
+ type=str,
559
+ default="no_filter",
560
+ choices=["no_filter", "max_new_tokens"],
561
+ help=(f"The strategy to use when filtering/skipping rows if the model didn't ",
562
+ f"generate enough tokens to facilitate analysis.")
563
+ )
564
+ parser.add_argument(
565
+ "--initial_seed",
566
+ type=int,
567
+ default=1234,
568
+ help=("The initial seed to use in the blacklist randomization process.",
569
+ "Is unused if the process is markov generally. Can be None."),
570
+ )
571
+ parser.add_argument(
572
+ "--dynamic_seed",
573
+ type=str,
574
+ default="markov_1",
575
+ choices=[None, "initial", "markov_1"],
576
+ help="The seeding procedure to use when sampling the blacklist at each step.",
577
+ )
578
+ parser.add_argument(
579
+ "--bl_proportion",
580
+ type=float,
581
+ default=0.5,
582
+ help="The ratio of blacklist to whitelist tokens when splitting the vocabulary",
583
+ )
584
+ parser.add_argument(
585
+ "--bl_logit_bias",
586
+ type=float,
587
+ default=1.0,
588
+ help="The amount of bias (absolute) to add to the logits in the whitelist half of the vocabulary at every step",
589
+ )
590
+ parser.add_argument(
591
+ "--bl_type",
592
+ type=str,
593
+ default="soft",
594
+ choices=["soft", "hard"],
595
+ help="The type of blacklisting being performed.",
596
+ )
597
+ parser.add_argument(
598
+ "--num_beams",
599
+ type=int,
600
+ default=1,
601
+ help="The number of beams to use where '1' is no beam search.",
602
+ )
603
+ parser.add_argument(
604
+ "--no_repeat_ngram_size",
605
+ type=int,
606
+ default=0,
607
+ # default=8,
608
+ help="ngram size to force the model not to generate, can't be too small or model is handicapped, too large and blows up in complexity.",
609
+ )
610
+ parser.add_argument(
611
+ "--early_stopping",
612
+ type=str2bool,
613
+ default=False,
614
+ help="Whether to use early stopping, only for beam search.",
615
+ )
616
+ # parser.add_argument(
617
+ # "--hard_min_length",
618
+ # type=str2bool,
619
+ # default=False,
620
+ # help="Whether to use the min length logits processor to force the generations to be max_new_tokens.",
621
+ # )
622
+ parser.add_argument(
623
+ "--oracle_model_name",
624
+ type=str,
625
+ default="EleutherAI/gpt-j-6B",
626
+ help="PPL scoring, or oracle model, path to pretrained model or model identifier from huggingface.co/models.",
627
+ )
628
+ parser.add_argument(
629
+ "--no_wandb",
630
+ type=str2bool,
631
+ default=False,
632
+ help="Whether to log to wandb.",
633
+ )
634
+ parser.add_argument(
635
+ "--wandb_project",
636
+ type=str,
637
+ default="lm-blacklisting",
638
+ help="The name of the wandb project.",
639
+ )
640
+ parser.add_argument(
641
+ "--wandb_entity",
642
+ type=str,
643
+ default="jwkirchenbauer",
644
+ help="The wandb entity/user for the project.",
645
+ )
646
+ parser.add_argument(
647
+ "--run_name",
648
+ type=str,
649
+ default=None,
650
+ help="The unique name for the run.",
651
+ )
652
+ parser.add_argument(
653
+ "--output_dir",
654
+ type=str,
655
+ default="./output",
656
+ help="The unique name for the run.",
657
+ )
658
+ parser.add_argument(
659
+ "--load_prev_generations",
660
+ type=str2bool,
661
+ default=False,
662
+ help=("Whether to run generations or load from a json lines in the output_dir. "
663
+ "If True, this file must exist and meta/args must match"),
664
+ )
665
+ parser.add_argument(
666
+ "--store_bl_ids",
667
+ type=str2bool,
668
+ default=False,
669
+ help=("Whether to store all the blacklists while generating with bl processor. "),
670
+ )
671
+ parser.add_argument(
672
+ "--store_spike_ents",
673
+ type=str2bool,
674
+ default=False,
675
+ help=("Whether to store the spike entropies while generating with bl processor. "),
676
+ )
677
+ parser.add_argument(
678
+ "--use_sampling",
679
+ type=str2bool,
680
+ default=False,
681
+ help=("Whether to perform sampling during generation. (non-greedy decoding)"),
682
+ )
683
+ parser.add_argument(
684
+ "--sampling_temp",
685
+ type=float,
686
+ default=0.7,
687
+ help="The temperature to use when generating using multinom sampling",
688
+ )
689
+ parser.add_argument(
690
+ "--generate_only",
691
+ type=str2bool,
692
+ default=False,
693
+ help=("Whether to only produce outputs and not evaluate anything like ppl"),
694
+ )
695
+ parser.add_argument(
696
+ "--all_gas_no_eos",
697
+ type=str2bool,
698
+ default=False,
699
+ help=("Whether to weight the EOS token as -inf"),
700
+ )
701
+
702
+ args = parser.parse_args()
703
+
704
+ main(args)
705
+
lm-watermarking-main/experiments/submitit_utils.py ADDED
@@ -0,0 +1,79 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # stuff specifically for the sklearn logic
2
+ from typing import Mapping
3
+ from functools import partial, reduce
4
+ import operator
5
+
6
+ from itertools import product
7
+ import argparse
8
+
9
+ ###############################################################################
10
+ # A grid search convenience class
11
+ ###############################################################################
12
+ class ParameterGrid:
13
+ """logic YOINKED from sklearn <3
14
+ definitely worth just using the lib itself, or something fancier in future for
15
+ efficient sampling etc. It's implemented as an iterator interface but that's
16
+ probably not necessary"""
17
+
18
+ def __init__(self, params):
19
+ # we may want to take the product over a few sets of parameters
20
+ # independently of each other, so this expects a List[Mapping]
21
+ if isinstance(params, Mapping):
22
+ self.params = [params]
23
+ else:
24
+ self.params = params
25
+ # removed all checking code soooo make sure your
26
+ # param dict is already nice and conforming
27
+
28
+ def __iter__(self):
29
+ """Iterate over the points in the grid.
30
+ Returns
31
+ -------
32
+ params : iterator over dict of str to any
33
+ Yields dictionaries mapping each estimator parameter to one of its
34
+ allowed values.
35
+ """
36
+ for p in self.params:
37
+ # Always sort the keys of a dictionary, for reproducibility
38
+ items = sorted(p.items())
39
+ if not items:
40
+ yield {}
41
+ else:
42
+ keys, values = zip(*items)
43
+ for v in product(*values):
44
+ params = dict(zip(keys, v))
45
+ yield params
46
+
47
+ def __len__(self):
48
+ """Number of points on the grid."""
49
+ # Product function that can handle iterables (np.product can't).
50
+ product = partial(reduce, operator.mul)
51
+ return sum(
52
+ product(len(v) for v in p.values()) if p else 1 for p in self.params
53
+ )
54
+
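+ # example usage (hypothetical parameter names), e.g. when building sweep configs:
+ # grid = ParameterGrid({"bl_proportion": [0.5, 0.75], "bl_logit_bias": [1.0, 2.0]})
+ # len(grid) -> 4
+ # for params in grid: ... # yields one dict per combination, keys sorted for reproducibility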
55
+ ###############################################################################
56
+ # little "oneliner" reduce thingy that turns your shallow dict into
57
+ # the list [k1, v1, k2, v2, k3, v3 ...]
58
+ # and optionally "k1 v1 k2 v2 k3 v3"
59
+
60
+ def flatten_dict(dict, to_string=False, sep=" "):
61
+ flat_dict = reduce(operator.iconcat, dict.items(), [])
62
+ if to_string:
63
+ try:
64
+ return sep.join([str(elm) for elm in flat_dict])
65
+ except Exception:
66
+ raise ValueError(f'Error converting dict={flat_dict} to whitespace joined string')
67
+ else:
68
+ return flat_dict
69
+
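+ # e.g. flatten_dict({"a": 1, "b": 2}) -> ["a", 1, "b", 2]
+ # flatten_dict({"a": 1, "b": 2}, to_string=True) -> "a 1 b 2"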
70
+
71
+ def str2bool(v):
72
+ if isinstance(v, bool):
73
+ return v
74
+ if v.lower() in ('yes', 'true', 't', 'y', '1'):
75
+ return True
76
+ elif v.lower() in ('no', 'false', 'f', 'n', '0'):
77
+ return False
78
+ else:
79
+ raise argparse.ArgumentTypeError('Boolean value expected.')
lm-watermarking-main/experiments/watermark.py ADDED
@@ -0,0 +1,820 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # micro lib to implement the watermarking extensions to LM generation
2
+ # as well as utils for eval/validation
3
+
4
+ from typing import List, Optional, Callable
5
+
6
+ import time
7
+ import random
8
+ import math
9
+ import torch
10
+ import numpy as np
11
+
12
+ from torch import Tensor
13
+ from tokenizers import Tokenizer
14
+ from transformers import LogitsProcessor, LogitsProcessorList, set_seed
15
+
16
+
17
+ def tokenize_and_truncate(example: dict,
18
+ completion_length: int = None,
19
+ prompt_length: int = None,
20
+ hf_model_name: str = None,
21
+ tokenizer = None,
22
+ model_max_seq_len: int = 4096):
23
+ """take hf dataset entry and preprocess it for completion by a model"""
24
+ assert hf_model_name is not None, "need model name to know whether to adjust wrt special tokens"
25
+ assert "text" in example, "expects 'text' field to be present"
26
+ # tokenize
27
+ inputs = tokenizer.encode(example["text"], return_tensors="pt", truncation=True, max_length=model_max_seq_len)
28
+ example.update({"untruncated_inputs": inputs})
29
+
30
+ if (completion_length is not None) and (prompt_length is None):
31
+ # leave at least one token as prefix # FIXME I think plus 1 since 0 is start tok
32
+ slice_length = min(inputs.shape[1]-1, completion_length)
33
+ elif (prompt_length is not None) and (completion_length is None):
34
+ desired_comp_len = (inputs.shape[1]-1) - prompt_length
35
+ slice_length = desired_comp_len if desired_comp_len > 0 else 0
36
+ else:
37
+ raise ValueError((f"Can only tokenize and truncate based on either the desired prompt length or desired completion length,",
38
+ f" but got completion_length:{completion_length},prompt_length:{prompt_length}"))
39
+
40
+ # truncate
41
+ inputs = inputs[:,:inputs.shape[1]-slice_length]
42
+ # logic depending on special tokens for the model
43
+ if "t5" in hf_model_name or "T0" in hf_model_name:
44
+ inputs[0,-1] = 1
45
+ # else: pass
46
+ example.update({"inputs": inputs})
47
+ return example
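+ # e.g. (hypothetical sizes) for a 300-token "text" with completion_length=100:
+ # "untruncated_inputs" keeps all 300 tokens while "inputs" keeps only the first 200,
+ # i.e. the prompt from which the model will regenerate the final 100 tokens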
48
+
49
+
50
+ class BlacklistLogitsProcessor(LogitsProcessor):
51
+ """
52
+ [`LogitsProcessor`] that watermarks generation: at each step it pseudorandomly splits the vocabulary into a blacklist and whitelist (seeded e.g. from the previous token) and either masks the blacklist ("hard") or adds a positive bias to the whitelist logits ("soft").
53
+
54
+ Args:
55
+ bad_words_ids (`List[List[int]]`):
56
+ List of list of token ids that are not allowed to be generated. In order to get the token ids of the words
57
+ that should not appear in the generated text, use `tokenizer(bad_words, add_prefix_space=True,
58
+ add_special_tokens=False).input_ids`.
59
+ eos_token_id (`int`):
60
+ The id of the *end-of-sequence* token.
61
+ """
62
+
63
+ def __init__(self,
64
+ bad_words_ids: List[List[int]],
65
+ eos_token_id: int,
66
+ vocab: list[int],
67
+ vocab_size: int,
68
+ bl_proportion: float=0.5,
69
+ bl_logit_bias: float=1.0,
70
+ bl_type: str = "hard", # "soft"
71
+ initial_seed: int=None,
72
+ dynamic_seed: str=None, # "initial", "markov_1", None
73
+ store_bl_ids: bool=True,
74
+ store_spike_ents: bool = False,
75
+ noop_blacklist: bool = False,
76
+ ):
77
+
78
+ self.vocab = vocab
79
+ self.vocab_size = vocab_size
80
+ self.bl_proportion = bl_proportion
81
+ self.bl_logit_bias = bl_logit_bias
82
+ self.bl_type = bl_type
83
+
84
+ if initial_seed is None:
85
+ self.initial_seed = None
86
+ assert dynamic_seed != "initial"
87
+ else:
88
+ random.seed(initial_seed)
89
+ self.initial_seed = initial_seed
90
+
91
+ self.dynamic_seed = dynamic_seed
92
+
93
+ self.eos_token_id = eos_token_id
94
+ # self.bad_words_id_length_1 = self._prepare_bad_words(bad_words_ids)
95
+
96
+ self.bad_words_mask: Optional[torch.LongTensor] = None
97
+
98
+ self.store_bl_ids = store_bl_ids
99
+ self.bl_ids = None
100
+
101
+ self.store_spike_ents = store_spike_ents
102
+ self.spike_entropies = None
103
+
104
+ # hack to replace this with an approximation of infinite bias
105
+ # so that the expectation coefficient will come out to 1.0
106
+ if self.bl_type == "hard":
107
+ self.bl_logit_bias = 10000 # FIXME to a value that is actually close to the largest soft watermark used
108
+
109
+ alpha = torch.exp(torch.tensor(self.bl_logit_bias)).item()
110
+ # gamma = self.bl_proportion
111
+ gamma = 1.0-self.bl_proportion
112
+ self.alpha = alpha
113
+ self.gamma = gamma
114
+
115
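+ # per the soft watermark analysis: with alpha = exp(bias) and gamma = the whitelist
+ # fraction, z_value is the modulus used by compute_spike_entropy below, and
+ # expected_wl_coef is the coefficient multiplying the average spike entropy in the
+ # lower bound on the expected whitelist-token fraction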
+ self.z_value = ((1-gamma)*(alpha-1))/(1-gamma+(alpha*gamma))
116
+ self.expected_wl_coef = (gamma*alpha)/(1-gamma+(alpha*gamma))
117
+
118
+ # catch for overflow when bias is "infinite"
119
+ if self.alpha == torch.inf:
120
+ self.z_value = 1.0
121
+ self.expected_wl_coef = 1.0
122
+
123
+ self.noop_blacklist = noop_blacklist
124
+ if self.noop_blacklist: print(f"Blacklist processor for accounting only, no rescoring of logits")
125
+
126
+ self.g_cuda = None
127
+ self.large_prime = 15485863
128
+
129
+ @property
130
+ def blacklisted_ids(self):
131
+ assert self.store_bl_ids, "Need to instantiate processor with `store_bl_ids` to be able to retrieve them later"
132
+ # flatten each index's blacklist
133
+ l_of_bl_ids = [[] for _ in range(len(self.bl_ids))]
134
+ for b_idx, batch in enumerate(self.bl_ids):
135
+ for l_of_l, seed in batch:
136
+ bl_ids = [l[0] for l in l_of_l] # this was the main line, maybe unnecessary now?
137
+ l_of_bl_ids[b_idx].append((bl_ids,seed))
138
+ return l_of_bl_ids
139
+
140
+ def get_and_clear_stored_bl_ids(self):
141
+ old_bl_ids = self.bl_ids
142
+ self.bl_ids = None
143
+ return old_bl_ids
144
+
145
+ def get_spike_entropies(self):
146
+ spike_ents = [[] for _ in range(len(self.spike_entropies))]
147
+ for b_idx, ent_tensor_list in enumerate(self.spike_entropies):
148
+ for ent_tensor in ent_tensor_list:
149
+ spike_ents[b_idx].append(ent_tensor.item())
150
+ return spike_ents
151
+
152
+ def get_and_clear_stored_spike_ents(self):
153
+ spike_ents = self.get_spike_entropies()
154
+ self.spike_entropies = None
155
+ return spike_ents
156
+
157
+ def compute_spike_entropy(self, scores):
158
+ # precomputed z value in init
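+ # spike entropy: S = sum_k p_k / (1 + z * p_k); maximal for a uniform next-token
+ # distribution and minimal for a one-hot one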
159
+ probs = scores.softmax(dim=-1)
160
+ denoms = 1+(self.z_value*probs)
161
+ renormed_probs = probs / denoms
162
+ sum_renormed_probs = renormed_probs.sum()
163
+ return sum_renormed_probs
164
+
165
+ def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor) -> torch.FloatTensor:
166
+
167
+ self.bad_words_id_length_1 = [None for _ in range(input_ids.shape[0])]
168
+
169
+ if self.g_cuda is None:
170
+ self.g_cuda = torch.Generator(device=input_ids.device)
171
+
172
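+ # (re)seed the generator separately for each batch item: "initial" reuses the fixed
+ # seed at every step, "markov_1" hashes the previous token id, None lets the state evolve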
+ for b_idx in range(input_ids.shape[0]):
173
+ if self.dynamic_seed == "initial":
174
+ self.g_cuda.manual_seed(self.large_prime*self.initial_seed)
175
+ elif self.dynamic_seed == "markov_1":
176
+ self.g_cuda.manual_seed(self.large_prime*input_ids[b_idx][-1].item())
177
+ elif self.dynamic_seed is None:
178
+ # let the rng evolve naturally - this is not a realistic setting
179
+ pass
180
+
181
+ bl_ct = int(self.vocab_size*self.bl_proportion)
182
+ blacklist_ids = torch.randperm(self.vocab_size, device=input_ids.device, generator=self.g_cuda)[:bl_ct] # ty Yuxin :]
183
+
184
+ if self.store_bl_ids:
185
+ if self.bl_ids is None: self.bl_ids = [[] for _ in range(input_ids.shape[0])]
186
+ self.bl_ids[b_idx].append((blacklist_ids,input_ids.tolist()[b_idx][-1]))
187
+
188
+ if self.store_spike_ents:
189
+ if self.spike_entropies is None: self.spike_entropies = [[] for _ in range(input_ids.shape[0])]
190
+ self.spike_entropies[b_idx].append(self.compute_spike_entropy(scores[b_idx]))
191
+
192
+ # self.bad_words_id_length_1[b_idx] = self._prepare_bad_words(blacklist_ids)
193
+ # this logic may not really be necessary for our usecase
194
+ self.bad_words_id_length_1[b_idx] = blacklist_ids
195
+
196
+ if not self.noop_blacklist:
197
+ self.bad_words_mask = self._calc_curr_bad_word_mask(scores)
198
+ scores = self._set_scores_to_inf_for_banned_tokens(scores)
199
+
200
+ return scores
201
+
202
+ def _prepare_bad_words(self, bad_words_ids: List[List[int]]) -> list[int]:
203
+ bad_words_ids = list(filter(lambda bad_token_seq: bad_token_seq != [self.eos_token_id], bad_words_ids))
204
+ return bad_words_ids
205
+ # used to have more logic, not used now
206
+
207
+ def _calc_curr_bad_word_mask(self, scores: torch.FloatTensor) -> torch.BoolTensor:
208
+ bad_words_mask = torch.zeros_like(scores)
209
+ for b_idx in range(len(self.bad_words_id_length_1)):
210
+ bad_words_mask[b_idx][self.bad_words_id_length_1[b_idx]] = 1
211
+ final_mask = bad_words_mask.bool()
212
+ return final_mask
213
+
214
+ def _set_scores_to_inf_for_banned_tokens(
215
+ self, scores: torch.Tensor
216
+ ) -> torch.Tensor:
217
+ """
218
+ Modifies the scores in place by setting the banned token positions to `-inf`. Banned token is expected to be a
219
+ list of list of banned tokens to ban in the format [[batch index, vocabulary position],...
220
+
221
+ Args:
222
+ scores: logits distribution of shape (batch size, vocabulary size)
223
+ banned_tokens: list of list of tokens to ban of length (batch_size)
224
+ # NOTE^^ Omitted logic for dynamic mask based on multi-token ban words
225
+ """
226
+ if self.bl_type == "hard":
227
+ scores = scores.masked_fill(self.bad_words_mask, -float("inf"))
228
+ elif self.bl_type == "soft":
229
+ whitelist_mask = torch.logical_not(self.bad_words_mask)
230
+ blacklist_mask = self.bad_words_mask
231
+ scores[whitelist_mask] = scores[whitelist_mask] + self.bl_logit_bias
232
+ # scores[blacklist_mask] = scores[blacklist_mask] - self.bl_logit_bias # additive only
233
+ else:
234
+ raise NotImplementedError(f"unrecognized bl type {self.bl_type}!")
235
+ return scores
236
+
237
+
238
+ def score_sequence(inputs: Tensor = None,
239
+ outputs: Tensor = None,
240
+ tokenizer: Tokenizer = None,
241
+ # logits_processor: LogitsProcessor = None,
242
+ initial_seed: int = None,
243
+ dynamic_seed: str = None,
244
+ bl_proportion: float = None,
245
+ use_cuda: bool = True,
246
+ record_hits: bool = False,
247
+ debug: bool = True,
248
+ # trim_tokens: int = 2,
249
+ ):
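+ # post-hoc detection: re-derive the blacklist at every generated position using the
+ # same seeding scheme as generation and count how many output tokens land in it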
250
+
251
+ assert (inputs is not None) and \
252
+ (outputs is not None) and \
253
+ (tokenizer is not None),"output tensor, tokenizer, and bl params req'd"
254
+ # (logits_processor is not None),
255
+
256
+ vocabulary = list(tokenizer.get_vocab().values())
257
+ vocab_size = len(vocabulary)
258
+
259
+ model_generations = outputs.tolist()[0] # these are tensors unpack once for speed
260
+ # toks_generated = model_generations[num_orig_input_tokens:]
261
+ toks_generated = model_generations
262
+ num_toks_generated = len(toks_generated)
263
+
264
+ # num_toks_to_trim = trim_tokens*2
265
+ # if (num_toks_generated-num_toks_to_trim > 0) == False:
266
+ # return -1, -1
267
+
268
+ # assert num_toks_generated > num_toks_to_trim, f"Need more than {num_toks_to_trim} toks total since we trim start and end a bit."
269
+
270
+ # toks_generated = toks_generated[trim_tokens:-trim_tokens]
271
+
272
+ if initial_seed is not None:
273
+ random.seed(initial_seed)
274
+
275
+ device = (torch.device("cuda") if use_cuda else torch.device("cpu"))
276
+ g_cuda = torch.Generator(device=device)
277
+ large_prime = 15485863
278
+
279
+ bl_hits, hit_list = 0, []
280
+
281
+ prev_token = inputs[0][-1].item()
282
+ # prev_token = toks_generated[0] # haven't decided whether this edge effect matters
283
+
284
+ # for idx,tok_gend in enumerate(toks_generated[1:]):
285
+ for idx,tok_gend in enumerate(toks_generated):
286
+
287
+ # prev_token = model_generations[num_orig_input_tokens+idx-1]
288
+
289
+ if dynamic_seed == "initial":
290
+ g_cuda.manual_seed(large_prime*initial_seed)
291
+ elif dynamic_seed == "markov_1":
292
+ g_cuda.manual_seed(large_prime*prev_token)
293
+ elif dynamic_seed is None:
294
+ # let the rng evolve naturally - this is not a realistic setting
295
+ pass
296
+
297
+ bl_ct = int(vocab_size*bl_proportion)
298
+ posthoc_blacklist = torch.randperm(vocab_size, device=device, generator=g_cuda)[:bl_ct] # ty Yuxin :]
299
+
300
+ tok_in_ph_bl = tok_gend in posthoc_blacklist
301
+ if tok_in_ph_bl:
302
+ bl_hits += 1
303
+ hit_list.append(True)
304
+ else:
305
+ hit_list.append(False)
306
+
307
+ if debug:
308
+ decoded_token = tokenizer.decode(tok_gend, skip_special_tokens=True)
309
+ print(f"Token generated: '{decoded_token}' was in the blacklist {tok_in_ph_bl}")
310
+
311
+ prev_token = tok_gend
312
+
313
+ if debug:
314
+ print(f"wl hits / num tokens : {num_toks_generated-bl_hits}/{num_toks_generated} = {(num_toks_generated-bl_hits)/num_toks_generated:.02f}")
315
+ print(f"bl hits / num tokens : {bl_hits}/{num_toks_generated} = {bl_hits/num_toks_generated:.02f}")
316
+
317
+ if record_hits:
318
+ return bl_hits, num_toks_generated, hit_list
319
+ # bl_fraction = bl_hits/num_toks_generated
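+ # (not computed here, but downstream these counts support a one-proportion z-test,
+ # e.g. z = (wl_hits - gamma*T) / sqrt(T*gamma*(1-gamma)) with T = num_toks_generated
+ # and gamma the whitelist fraction of the vocabulary)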
320
+ return bl_hits, num_toks_generated
321
+
322
+
323
+ def tokenize_for_generation(example: dict,
324
+ idx: int,
325
+ max_new_tokens: int=None,
326
+ min_prompt_tokens: int=None,
327
+ hf_model_name : str=None,
328
+ tokenizer: Tokenizer=None,
329
+ model: torch.nn.Module=None):
330
+
331
+ # preprocessing, generation & scoring
332
+ assert isinstance(example, dict), "Expect no batch dimension currently!"
333
+
334
+ # preprocess for model generation/completion
335
+ example = tokenize_and_truncate(example,
336
+ completion_length=max_new_tokens,
337
+ prompt_length=min_prompt_tokens,
338
+ hf_model_name=hf_model_name,
339
+ tokenizer=tokenizer,
340
+ # model_max_seq_len=model.config.max_position_embeddings)
341
+ model_max_seq_len=None)
342
+ inputs = example["inputs"]
343
+ # for calculating the baseline violation rate across the "gold" completion
344
+ untruncated_inputs = example["untruncated_inputs"]
345
+
346
+ # decode the preprocessed input to store for audit
347
+ re_decoded_input = tokenizer.batch_decode(inputs, skip_special_tokens=True)[0]
348
+ example.update({"truncated_input":re_decoded_input})
349
+
350
+ # also decode the original suffix of the input for audit as the baseline
351
+ decoded_untruncated_input = tokenizer.batch_decode(untruncated_inputs, skip_special_tokens=True)[0]
352
+ example.update({"baseline_completion":decoded_untruncated_input.replace(re_decoded_input,"")})
353
+
354
+ example.update({
355
+ "orig_sample_length" : untruncated_inputs.shape[1],
356
+ "prompt_length" : inputs.shape[1],
357
+ "real_completion_length" : untruncated_inputs.shape[1] - inputs.shape[1],
358
+ })
359
+ return example
360
+
361
+
362
+ def generate_completions(example: dict,
363
+ idx: int,
364
+ max_new_tokens: int=None,
365
+ hf_model_name : str=None,
366
+ tokenizer: Tokenizer=None,
367
+ model: torch.nn.Module=None,
368
+ no_bl_partial: Callable=None,
369
+ w_bl_partial: Callable=None,
370
+ # return_logits: bool=False,
371
+ bl_processor_list: LogitsProcessorList=None):
372
+
373
+ # preprocessing, generation & scoring
374
+ assert isinstance(example, dict), "Expect no batch dimension currently!"
375
+
376
+ # # preprocess for model generation/completion
377
+ # example = tokenize_and_truncate(example,
378
+ # completion_length=max_new_tokens,
379
+ # hf_model_name=hf_model_name,
380
+ # tokenizer=tokenizer,
381
+ # model_max_seq_len=model.config.max_position_embeddings)
382
+ # inputs = example["inputs"]
383
+ # # for calculating the baseline violation rate across the "gold" completion
384
+ # untruncated_inputs = example["untruncated_inputs"]
385
+
386
+ # # decode the preprocessed input to store for audit
387
+ # re_decoded_input = tokenizer.batch_decode(inputs, skip_special_tokens=True)[0]
388
+ # example.update({"truncated_input":re_decoded_input})
389
+
390
+ # # also decode the original suffix of the input for audit as the baseline
391
+ # decoded_untruncated_input = tokenizer.batch_decode(untruncated_inputs, skip_special_tokens=True)[0]
392
+ # example.update({"baseline_completion":decoded_untruncated_input.replace(re_decoded_input,"")})
393
+
394
+ inputs = example["inputs"]
395
+ re_decoded_input = example["truncated_input"]
396
+
397
+ # call the vanilla and watermarked generation function wrappers with the preprocessed inputs
398
+ with torch.no_grad():
399
+
400
+ samples_taken = 0
401
+ max_retries = 10
402
+ success = False
403
+ while (success is False) and (samples_taken < max_retries):
404
+ samples_taken += 1
405
+
406
+ # set_seed(1234) # debugging the error when using sampling # leaving this off for now
407
+
408
+ start_generation = time.time()
409
+ outputs_no_bl = no_bl_partial(inputs.to(model.device))
410
+ example["no_bl_gen_time"] = time.time() - start_generation
411
+
412
+ # set_seed(1234) # debugging the error when using sampling
413
+
414
+ start_generation = time.time()
415
+ outputs_w_bl = w_bl_partial(inputs.to(model.device))
416
+ example["w_bl_gen_time"] = time.time() - start_generation
417
+
418
+ # if return_logits:
419
+ # output_no_bl_dict = outputs_no_bl
420
+ # logits_no_bl = output_no_bl_dict.scores
421
+ # outputs_no_bl = output_no_bl_dict.sequences
422
+ # example["logits_no_bl"] = logits_no_bl
423
+
424
+ # output_w_bl_dict = outputs_w_bl
425
+ # logits_w_bl = output_w_bl_dict.scores
426
+ # outputs_w_bl = output_w_bl_dict.sequences
427
+ # example["logits_w_bl"] = logits_w_bl
428
+
429
+
430
+ if bl_processor_list:
431
+ if bl_processor_list[0].bl_ids is not None:
432
+ example["bl_ids"] = bl_processor_list[0].get_and_clear_stored_bl_ids()
433
+ if bl_processor_list[0].spike_entropies is not None:
434
+ example["spike_entropies"] = bl_processor_list[0].get_and_clear_stored_spike_ents()
435
+
436
+ try:
437
+ # decode and store the new generations for auditing
438
+ no_bl_decoded_output = tokenizer.batch_decode(outputs_no_bl, skip_special_tokens=True)[0]
439
+ example.update({"no_bl_output":no_bl_decoded_output.replace(re_decoded_input,"")})
440
+
441
+ w_bl_decoded_output = tokenizer.batch_decode(outputs_w_bl, skip_special_tokens=True)[0]
442
+ example.update({"w_bl_output":w_bl_decoded_output.replace(re_decoded_input,"")})
443
+
444
+ success = True
445
+
446
+ except Exception:
447
+ # log what happened
448
+ print(f"Error while trying to decode the outputs of the model...")
449
+ if samples_taken == 1:
450
+ print(f"truncated_input: {inputs.tolist()}")
451
+ print(f"Result of attempt {samples_taken}")
452
+ print(f"shape outputs_no_bl: {outputs_no_bl.shape}")
453
+ no_bl_toks = outputs_no_bl.tolist()[0]
454
+ print(f"outputs_no_bl: {no_bl_toks}")
455
+ print(f"outputs_no_bl min: {min(no_bl_toks)}")
456
+ print(f"outputs_no_bl max: {max(no_bl_toks)}")
457
+
458
+ print(f"shape outputs_w_bl: {outputs_w_bl.shape}")
459
+ w_bl_toks = outputs_w_bl.tolist()[0]
460
+ print(f"outputs_w_bl: {w_bl_toks}")
461
+ print(f"outputs_w_bl min: {min(w_bl_toks)}")
462
+ print(f"outputs_w_bl max: {max(w_bl_toks)}")
463
+
464
+ if success is False:
465
+ print(f"Unable to get both a no_bl and w_bl output that were decodeable after {samples_taken} tries, returning empty strings.")
466
+ example.update({"no_bl_output":""})
467
+ example.update({"w_bl_output":""})
468
+ if bl_processor_list:
469
+ if bl_processor_list[0].bl_ids is not None:
470
+ example["bl_ids"] = []
471
+ if bl_processor_list[0].spike_entropies is not None:
472
+ example["spike_entropies"] = []
473
+
474
+ # Able to get lengths in here by checking
475
+ # truncated input shape versus the output shape
476
+
477
+ example.update({
478
+ # "baseline_num_tokens_generated" : untruncated_inputs.shape[1] - inputs.shape[1], # want this earlier now
479
+ "no_bl_num_tokens_generated" : outputs_no_bl.shape[1] - inputs.shape[1],
480
+ "w_bl_num_tokens_generated" : outputs_w_bl.shape[1] - inputs.shape[1]
481
+ })
482
+ example.update({
483
+ "no_bl_sec_per_tok" : example["no_bl_gen_time"]/example["no_bl_num_tokens_generated"],
484
+ "no_bl_tok_per_sec" : example["no_bl_num_tokens_generated"]/example["no_bl_gen_time"],
485
+ "w_bl_sec_per_tok" : example["w_bl_gen_time"]/example["w_bl_num_tokens_generated"],
486
+ "w_bl_tok_per_sec" : example["w_bl_num_tokens_generated"]/example["w_bl_gen_time"],
487
+ })
488
+
489
+ # now done externally because these persist outside this func
490
+ # # remove any fields we don't need to keep
491
+ # del example["inputs"]
492
+ # del example["untruncated_inputs"]
493
+
494
+ return example
495
+
496
+
497
+ def compute_bl_metrics(example: dict,
498
+ idx: int,
499
+ hf_model_name: str = None,
500
+ tokenizer: Tokenizer=None,
501
+ initial_seed: int = None,
502
+ dynamic_seed: str = None,
503
+ bl_proportion: float = None,
504
+ use_cuda: bool = None,
505
+ record_hits: bool = False,
506
+ limit_output_tokens: int = 0):
507
+
508
+ # if example["idx"] == 3: breakpoint()
509
+ # okay need to catch an odd bug here and fix things
510
+ baseline_before = example["baseline_completion"]
511
+ example["baseline_completion"] = baseline_before.replace(example["truncated_input"][:-1],"")
512
+ if example["baseline_completion"] != baseline_before:
513
+ print("baseline input replacement bug occurred!")
514
+
515
+ no_bl_before = example["no_bl_output"]
516
+ example["no_bl_output"] = no_bl_before.replace(example["truncated_input"][:-1],"")
517
+ if example["no_bl_output"] != no_bl_before:
518
+ print("no_bl_output input replacement bug occurred!")
519
+
520
+ w_bl_before = example["w_bl_output"]
521
+ example["w_bl_output"] = w_bl_before.replace(example["truncated_input"][:-1],"")
522
+ if example["w_bl_output"] != w_bl_before:
523
+ print("w_bl_output input replacement bug occurred!")
524
+
525
+ if ("w_bl_output_attacked" in example):
526
+ w_bl_attacked_before = example["w_bl_output_attacked"]
527
+ example["w_bl_output_attacked"] = w_bl_attacked_before.replace(example["truncated_input"][:-1],"")
528
+ if example["w_bl_output_attacked"] != w_bl_attacked_before:
529
+ print("w_bl_output_attacked input replacement bug occurred!")
530
+
531
+ ##########
532
+
533
+ # preprocess for model generation/completion
534
+ inputs = tokenize_and_truncate({"text":example["truncated_input"]},
535
+ completion_length=0,
536
+ hf_model_name=hf_model_name,
537
+ tokenizer=tokenizer)["inputs"]
538
+
539
+ baseline_outputs = tokenize_and_truncate({"text":example["baseline_completion"]},
540
+ completion_length=0,
541
+ hf_model_name=hf_model_name,
542
+ tokenizer=tokenizer)["inputs"][:,1:]
543
+
544
+ no_bl_outputs = tokenize_and_truncate({"text":example["no_bl_output"]},
545
+ completion_length=0,
546
+ hf_model_name=hf_model_name,
547
+ tokenizer=tokenizer)["inputs"][:,1:]
548
+
549
+ w_bl_outputs = tokenize_and_truncate({"text":example["w_bl_output"]},
550
+ completion_length=0,
551
+ hf_model_name=hf_model_name,
552
+ tokenizer=tokenizer)["inputs"][:,1:]
553
+ if "w_bl_output_attacked" in example:
554
+ w_bl_attacked_outputs = tokenize_and_truncate({"text":example["w_bl_output_attacked"]},
555
+ completion_length=0,
556
+ hf_model_name=hf_model_name,
557
+ tokenizer=tokenizer)["inputs"][:,1:]
558
+ else:
559
+ w_bl_attacked_outputs = None
560
+
561
+ if limit_output_tokens > 0:
562
+ example["orig_baseline_completion"] = example["baseline_completion"]
563
+ example["orig_real_completion_length"] = example["real_completion_length"]
564
+ baseline_outputs = baseline_outputs[:,:limit_output_tokens]
565
+ example["real_completion_length"] = baseline_outputs.shape[1]
566
+ example["baseline_completion"] = tokenizer.batch_decode(baseline_outputs, skip_special_tokens=True)[0]
567
+
568
+ example["orig_no_bl_output"] = example["no_bl_output"]
569
+ example["orig_no_bl_num_tokens_generated"] = example["no_bl_num_tokens_generated"]
570
+ no_bl_outputs = no_bl_outputs[:,:limit_output_tokens]
571
+ example["no_bl_num_tokens_generated"] = no_bl_outputs.shape[1]
572
+ example["no_bl_output"] = tokenizer.batch_decode(no_bl_outputs, skip_special_tokens=True)[0]
573
+
574
+ example["orig_w_bl_output"] = example["w_bl_output"]
575
+ example["orig_w_bl_num_tokens_generated"] = example["w_bl_num_tokens_generated"]
576
+ w_bl_outputs = w_bl_outputs[:,:limit_output_tokens]
577
+ example["w_bl_num_tokens_generated"] = w_bl_outputs.shape[1]
578
+ example["w_bl_output"] = tokenizer.batch_decode(w_bl_outputs, skip_special_tokens=True)[0]
579
+
580
+ example["orig_spike_entropies"] = example["spike_entropies"]
581
+ example["spike_entropies"] = [example["spike_entropies"][0][:limit_output_tokens]]
582
+
583
+ if "w_bl_output_attacked" in example:
584
+ # raise NotImplementedError("Havent thought what to do yet for this")
585
+ example["orig_w_bl_output_attacked"] = example["w_bl_output_attacked"]
586
+ # example["orig_w_bl_attacked_num_tokens_generated"] = example["w_bl_attacked_num_tokens_generated"]
587
+ w_bl_attacked_outputs = w_bl_attacked_outputs[:,:limit_output_tokens]
588
+ example["w_bl_attacked_num_tokens_generated"] = w_bl_attacked_outputs.shape[1]
589
+ example["w_bl_output_attacked"] = tokenizer.batch_decode(w_bl_attacked_outputs, skip_special_tokens=True)[0]
590
+
591
+ # score the 3 sequence completions/outputs wrt to bl hits
592
+ result = score_sequence(inputs=inputs,
593
+ outputs=baseline_outputs, # <-- real text completions
594
+ initial_seed=initial_seed,
595
+ dynamic_seed=dynamic_seed,
596
+ bl_proportion=bl_proportion,
597
+ tokenizer=tokenizer,
598
+ use_cuda=use_cuda,
599
+ record_hits=record_hits,
600
+ debug=False)
601
+ if record_hits:
602
+ bl_hits, num_toks_gend, hit_list = result
603
+ else:
604
+ bl_hits, num_toks_gend = result
605
+ example.update({"baseline_num_toks_gend_eq_0":(num_toks_gend == 0)})
606
+ # if num_toks_gend < 0.99*example["real_completion_length"]: breakpoint()
607
+ # if len(hit_list) < 0.99*example["real_completion_length"]: breakpoint()
608
+
609
+ if num_toks_gend == 0:
610
+ # print("No tokens generated, odd, avoiding div by zero and returning -1's")
611
+ wl_frac = -1
612
+ bl_frac = -1
613
+ else:
614
+ wl_frac = (num_toks_gend-bl_hits)/num_toks_gend
615
+ bl_frac = bl_hits/num_toks_gend
616
+ baseline_stats = {
617
+ "baseline_whitelist_fraction": wl_frac,
618
+ "baseline_blacklist_fraction": bl_frac
619
+ }
620
+ example.update(baseline_stats)
621
+ if record_hits: example.update({"baseline_hit_list":hit_list})
622
+
623
+ result = score_sequence(inputs=inputs,
624
+ outputs=no_bl_outputs, # <-- non-blacklisted version
625
+ initial_seed=initial_seed,
626
+ dynamic_seed=dynamic_seed,
627
+ bl_proportion=bl_proportion,
628
+ tokenizer=tokenizer,
629
+ record_hits=record_hits,
630
+ debug=False)
631
+ if record_hits:
632
+ bl_hits, num_toks_gend, hit_list = result
633
+ else:
634
+ bl_hits, num_toks_gend = result
635
+ example.update({"no_bl_num_toks_gend_eq_0":(num_toks_gend == 0)})
636
+ # if num_toks_gend < 0.99*example["no_bl_num_tokens_generated"]: breakpoint()
637
+ # if len(hit_list) < 0.99*example["no_bl_num_tokens_generated"]: breakpoint()
638
+
639
+ if num_toks_gend == 0:
640
+ # print("No tokens generated, odd, avoiding div by zero and returning -1's")
641
+ wl_frac = -1
642
+ bl_frac = -1
643
+ else:
644
+ wl_frac = (num_toks_gend-bl_hits)/num_toks_gend
645
+ bl_frac = bl_hits/num_toks_gend
646
+ no_bl_stats = {
647
+ "no_bl_whitelist_fraction": wl_frac,
648
+ "no_bl_blacklist_fraction": bl_frac
649
+ }
650
+ example.update(no_bl_stats)
651
+ if record_hits: example.update({"no_bl_hit_list":hit_list})
652
+
653
+ result = score_sequence(inputs=inputs,
654
+ outputs=w_bl_outputs, # <-- blacklisted version
655
+ initial_seed=initial_seed,
656
+ dynamic_seed=dynamic_seed,
657
+ bl_proportion=bl_proportion,
658
+ tokenizer=tokenizer,
659
+ record_hits=record_hits,
660
+ # breakpoint_on_hit=True, # banging head against wall
661
+ debug=False)
662
+ if record_hits:
663
+ bl_hits, num_toks_gend, hit_list = result
664
+ else:
665
+ bl_hits, num_toks_gend = result
666
+ example.update({"w_bl_num_toks_gend_eq_0":(num_toks_gend == 0)})
667
+ # if num_toks_gend < 0.99*example["w_bl_num_tokens_generated"]: breakpoint()
668
+ # if len(hit_list) < 0.99*example["w_bl_num_tokens_generated"]: breakpoint()
669
+
670
+ if num_toks_gend == 0:
671
+ # print("No tokens generated, odd, avoiding div by zero and returning -1's")
672
+ wl_frac = -1
673
+ bl_frac = -1
674
+ else:
675
+ wl_frac = (num_toks_gend-bl_hits)/num_toks_gend
676
+ bl_frac = bl_hits/num_toks_gend
677
+ w_bl_stats = {
678
+ "w_bl_whitelist_fraction": wl_frac,
679
+ "w_bl_blacklist_fraction": bl_frac
680
+ }
681
+ example.update(w_bl_stats)
682
+ if record_hits: example.update({"w_bl_hit_list":hit_list})
683
+
684
+     if w_bl_attacked_outputs is not None:
+         result = score_sequence(inputs=inputs,
+                                 outputs=w_bl_attacked_outputs, # <-- blacklisted but attacked version
+                                 initial_seed=initial_seed,
+                                 dynamic_seed=dynamic_seed,
+                                 bl_proportion=bl_proportion,
+                                 tokenizer=tokenizer,
+                                 record_hits=record_hits,
+                                 # breakpoint_on_hit=True, # banging head against wall
+                                 debug=False)
+         if record_hits:
+             bl_hits, num_toks_gend, hit_list = result
+         else:
+             bl_hits, num_toks_gend = result
+         example.update({"w_bl_attacked_num_toks_gend_eq_0":(num_toks_gend == 0)})
+         # if (num_toks_gend-bl_hits)/(num_toks_gend) < 1.0: breakpoint()
+
+         if num_toks_gend == 0:
+             # print("No tokens generated, odd, avoiding div by zero and returning -1's")
+             wl_frac = -1
+             bl_frac = -1
+         else:
+             wl_frac = (num_toks_gend-bl_hits)/num_toks_gend
+             bl_frac = bl_hits/num_toks_gend
+         w_bl_attacked_stats = {
+             "w_bl_attacked_num_tokens_generated": num_toks_gend,
+             "w_bl_attacked_whitelist_fraction": wl_frac,
+             "w_bl_attacked_blacklist_fraction": bl_frac
+         }
+         example.update(w_bl_attacked_stats)
+         if record_hits: example.update({"w_bl_attacked_hit_list":hit_list})
+
+     return example
+
+
+
+ def aggregate_bl_stats(example: dict, idx: int, stat_table: dict):
+
+     stat_table["baseline_stats"]["whitelist_fraction"] += example["baseline_stats"]["whitelist_fraction"]
+     stat_table["baseline_stats"]["blacklist_fraction"] += example["baseline_stats"]["blacklist_fraction"]
+
+     stat_table["w_bl_stats"]["whitelist_fraction"] += example["w_bl_stats"]["whitelist_fraction"]
+     stat_table["w_bl_stats"]["blacklist_fraction"] += example["w_bl_stats"]["blacklist_fraction"]
+
+     stat_table["no_bl_stats"]["whitelist_fraction"] += example["no_bl_stats"]["whitelist_fraction"]
+     stat_table["no_bl_stats"]["blacklist_fraction"] += example["no_bl_stats"]["blacklist_fraction"]
+
+     stat_table["num_examples"] += 1
+
+     return example
+
+
+ def compute_ppl_single(prefix_and_output_text = None,
+                        output_text = None,
+                        oracle_model_name = None,
+                        oracle_model = None,
+                        oracle_tokenizer = None):
+
+     with torch.no_grad():
+         tokd_prefix = tokenize_and_truncate({"text":prefix_and_output_text}, completion_length=0, hf_model_name=oracle_model_name, tokenizer=oracle_tokenizer, model_max_seq_len=oracle_model.config.max_position_embeddings)["inputs"]
+         tokd_inputs = tokd_prefix
+         # to score only the "generation" part, we need the length of the suffix tokenization
+         tokd_suffix = tokenize_and_truncate({"text":output_text}, completion_length=0, hf_model_name=oracle_model_name, tokenizer=oracle_tokenizer, model_max_seq_len=oracle_model.config.max_position_embeddings)["inputs"]
+
+         tokd_inputs = tokd_inputs.to(oracle_model.device)
+         # make labels, masking (-100) the positions that should not be scored
+         tokd_labels = tokd_inputs.clone().detach()
+         tokd_labels[:,:tokd_labels.shape[1]-tokd_suffix.shape[1]+1] = -100
+
+         outputs = oracle_model(input_ids=tokd_inputs, labels=tokd_labels)
+         loss = outputs.loss # avg CE loss over the unmasked (non -100) positions; TODO verify this is working correctly
+         ppl = torch.tensor(math.exp(loss))
+
+     return loss.item(), ppl.item()
+
+
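The `-100` masking in `compute_ppl_single` is what restricts the perplexity measurement to the generated completion: Hugging Face causal LM losses ignore any label position set to `-100`. Below is a minimal, self-contained sketch of that masking idea; the tensors are hypothetical and not part of the pipeline, and note that the original line's `+1` additionally masks the label of the first completion token.

import torch

full_sequence = torch.tensor([[11, 12, 13, 14, 15, 16]])  # prompt tokens followed by completion tokens
suffix_len = 2                                            # number of completion tokens
labels = full_sequence.clone()
labels[:, : labels.shape[1] - suffix_len] = -100          # mask the prompt positions
# labels is now [[-100, -100, -100, -100, 15, 16]], so only the
# completion tokens contribute to the cross-entropy loss / perplexity.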
+ def evaluate_generation_fluency(example: dict,
+                                 idx: int,
+                                 oracle_model_name = None,
+                                 oracle_model = None,
+                                 oracle_tokenizer = None):
+
+     # pull out the required fields from the pipeline results
+     inputs_plus_baseline_output = f"{example['truncated_input']}{example['baseline_completion']}"
+     baseline_output = f"{example['baseline_completion']}"
+
+     inputs_plus_no_bl_output = f"{example['truncated_input']}{example['no_bl_output']}"
+     no_bl_output = f"{example['no_bl_output']}"
+
+     inputs_plus_w_bl_output = f"{example['truncated_input']}{example['w_bl_output']}"
+     w_bl_output = f"{example['w_bl_output']}"
+
+     # add metrics
+     loss, ppl = compute_ppl_single(inputs_plus_baseline_output, baseline_output, oracle_model_name, oracle_model, oracle_tokenizer)
+     example["baseline_loss"] = loss
+     example["baseline_ppl"] = ppl
+     loss, ppl = compute_ppl_single(inputs_plus_no_bl_output, no_bl_output, oracle_model_name, oracle_model, oracle_tokenizer)
+     example["no_bl_loss"] = loss
+     example["no_bl_ppl"] = ppl
+     loss, ppl = compute_ppl_single(inputs_plus_w_bl_output, w_bl_output, oracle_model_name, oracle_model, oracle_tokenizer)
+     example["w_bl_loss"] = loss
+     example["w_bl_ppl"] = ppl
+
+     # del any temp values
+     return example
+
+
+ def add_idx(example,idx):
+     example.update({"idx":idx})
+     return example
+
+
+ def check_input_lengths(example,idx, min_sample_len=0, min_prompt_len=0, min_completion_len=0):
+     orig_sample_length = example["orig_sample_length"]
+     prompt_length = example["prompt_length"]
+     real_completion_length = example["real_completion_length"]
+
+     # breakpoint()
+
+     conds = all([
+         orig_sample_length >= min_sample_len,
+         prompt_length >= min_prompt_len,
+         real_completion_length >= min_completion_len,
+     ])
+     return conds
+
+
+ def check_output_lengths(example,min_output_len=0):
+     no_bl_output_len = example["no_bl_num_tokens_generated"]
+     w_bl_output_len = example["w_bl_num_tokens_generated"]
+     conds = all([
+         no_bl_output_len >= min_output_len,
+         w_bl_output_len >= min_output_len,
+     ])
+     return conds
+
+
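The whitelist/blacklist fractions recorded by the functions above become a detection statistic downstream: under the null hypothesis of non-watermarked text, each of the T scored tokens lands in the whitelist with probability gamma, so the observed whitelist fraction is compared against gamma with a one-proportion z-test. This mirrors the `compute_z_score` helper in the analysis notebook below; the values in this short sketch are made up for illustration.

from math import sqrt
import scipy.stats

def compute_z_score(observed_wl_frac, T, gamma):
    # (observed fraction - expected fraction) / standard error of the fraction under the null
    return (observed_wl_frac - gamma) / sqrt(gamma * (1 - gamma) / T)

z = compute_z_score(observed_wl_frac=0.74, T=200, gamma=0.5)  # hypothetical values
p_value = scipy.stats.norm.sf(abs(z))                         # one-sided p-value, as in the notebook
# z is roughly 6.8 and p_value is far below 1e-6: strong evidence of the watermark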
lm-watermarking-main/experiments/watermarking_analysis.ipynb ADDED
@@ -0,0 +1,2049 @@
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "markdown",
5
+ "metadata": {},
6
+ "source": [
7
+ "# Watermark Analysis\n",
8
+ "\n",
9
+ "Notebook for performing analysis and visualization of the effects of watermarking schemes"
10
+ ]
11
+ },
12
+ {
13
+ "cell_type": "code",
14
+ "execution_count": null,
15
+ "metadata": {},
16
+ "outputs": [],
17
+ "source": [
18
+ "# Basic imports\n",
19
+ "import os\n",
20
+ "\n",
21
+ "from tqdm import tqdm\n",
22
+ "from statistics import mean\n",
23
+ "\n",
24
+ "import numpy as np\n",
25
+ "import pandas as pd\n",
26
+ "import torch\n",
27
+ "\n",
28
+ "import matplotlib.pyplot as plt\n",
29
+ "\n",
30
+ "from matplotlib import rc\n",
31
+ "rc('font', **{'family': 'serif', 'serif': ['Computer Modern']})\n",
32
+ "rc('text', usetex=True)\n",
33
+ "\n",
34
+ "import cmasher as cmr"
35
+ ]
36
+ },
37
+ {
38
+ "cell_type": "code",
39
+ "execution_count": null,
40
+ "metadata": {},
41
+ "outputs": [],
42
+ "source": [
43
+ "from datasets import load_from_disk"
44
+ ]
45
+ },
46
+ {
47
+ "attachments": {},
48
+ "cell_type": "markdown",
49
+ "metadata": {},
50
+ "source": [
51
+ "### Load the processed dataset/frame"
52
+ ]
53
+ },
54
+ {
55
+ "cell_type": "code",
56
+ "execution_count": null,
57
+ "metadata": {},
58
+ "outputs": [],
59
+ "source": [
60
+ "# save_name = \"analysis_ds_1-19_realnews_1-3_v1\" # in figure\n",
61
+ "# save_name = \"analysis_ds_1-21_greedy_redo\" \n",
62
+ "# save_name = \"analysis_ds_1-21_greedy_redo_truncated\"\n",
63
+ "# save_name = \"analysis_ds_1-23_greedy_gamma_0-25_truncated\" \n",
64
+ "# save_name = \"analysis_ds_1-23_greedy_gamma_0-25_0-5_truncated\" # in figure (not 100% sure this is correct, check)\n",
65
+ "\n",
66
+ "# save_name = \"analysis_ds_1-20_more_attack\" # in figure\n",
67
+ "\n",
68
+ "# save_name = \"analysis_ds_1-19_realnews_1-3_v1\" # in figure\n",
69
+ "# save_name = \"analysis_ds_1-23_en_1-3\"\n",
70
+ "save_name = \"analysis_ds_1-23_pile_1-3\"\n",
71
+ "\n",
72
+ "save_dir = f\"input/{save_name}\""
73
+ ]
74
+ },
75
+ {
76
+ "cell_type": "code",
77
+ "execution_count": null,
78
+ "metadata": {},
79
+ "outputs": [],
80
+ "source": [
81
+ "raw_data = load_from_disk(save_dir)"
82
+ ]
83
+ },
84
+ {
85
+ "cell_type": "markdown",
86
+ "metadata": {},
87
+ "source": [
88
+ "#### convert to pandas df"
89
+ ]
90
+ },
91
+ {
92
+ "cell_type": "code",
93
+ "execution_count": null,
94
+ "metadata": {},
95
+ "outputs": [],
96
+ "source": [
97
+ "df = raw_data.to_pandas()"
98
+ ]
99
+ },
100
+ {
101
+ "cell_type": "code",
102
+ "execution_count": null,
103
+ "metadata": {},
104
+ "outputs": [],
105
+ "source": [
106
+ "print(f\"Orig number of rows: {len(df)}\")\n",
107
+ "df.tail()"
108
+ ]
109
+ },
110
+ {
111
+ "cell_type": "code",
112
+ "execution_count": null,
113
+ "metadata": {},
114
+ "outputs": [],
115
+ "source": [
116
+ "df.columns"
117
+ ]
118
+ },
119
+ {
120
+ "attachments": {},
121
+ "cell_type": "markdown",
122
+ "metadata": {},
123
+ "source": [
124
+ "### \"retokenization\" problem \n",
125
+ "\n",
126
+ "current hypothesis: rows match this criterion because tokenization is not 1-to-1, so decoding and re-tokenizing an output can change the token sequence"
127
+ ]
128
+ },
129
+ {
130
+ "cell_type": "code",
131
+ "execution_count": null,
132
+ "metadata": {},
133
+ "outputs": [],
134
+ "source": [
135
+ "retok_problematic_rows = df[(df['w_bl_whitelist_fraction'] != -1.0) & (df['w_bl_whitelist_fraction'] != 1.0) & (df['bl_type'] == 'hard')]\n",
136
+ "print(f\"Num rows that are hard-blacklisted, and measurable, but still have a non-100% WL fraction: {len(retok_problematic_rows)} out of {len(df[df['bl_type'] == 'hard'])}\")\n",
137
+ "# retok_problematic_rows"
138
+ ]
139
+ },
140
+ {
141
+ "attachments": {},
142
+ "cell_type": "markdown",
143
+ "metadata": {},
144
+ "source": [
145
+ "#### Replace or drop the specially marked -1 rows since these are unmeasurable due to short length"
146
+ ]
147
+ },
148
+ {
149
+ "cell_type": "code",
150
+ "execution_count": null,
151
+ "metadata": {},
152
+ "outputs": [],
153
+ "source": [
154
+ "orig_len = len(df)\n",
155
+ "\n",
156
+ "# df['no_bl_whitelist_fraction'].mask(df['no_bl_whitelist_fraction'] == -1.0, pd.NA, inplace=True)\n",
157
+ "# df['w_bl_whitelist_fraction'].mask(df['w_bl_whitelist_fraction'] == -1.0, pd.NA, inplace=True)\n",
158
+ "\n",
159
+ "df = df[df[\"no_bl_whitelist_fraction\"] != -1.0]\n",
160
+ "df = df[df[\"w_bl_whitelist_fraction\"] != -1.0]\n",
161
+ "\n",
162
+ "print(f\"Dropped {orig_len-len(df)} rows, new len {len(df)}\")"
163
+ ]
164
+ },
165
+ {
166
+ "cell_type": "markdown",
167
+ "metadata": {},
168
+ "source": [
169
+ "#### Drop rows where there weren't enough tokens to measure ppl in one or both of the output cases"
170
+ ]
171
+ },
172
+ {
173
+ "cell_type": "code",
174
+ "execution_count": null,
175
+ "metadata": {},
176
+ "outputs": [],
177
+ "source": [
178
+ "orig_len = len(df)\n",
179
+ "# df = df[df[\"no_bl_ppl\"].isna()]\n",
180
+ "# df = df[df[\"w_bl_ppl\"].isna()]\n",
181
+ "df = df[~(df[\"no_bl_ppl\"].isna() | df[\"w_bl_ppl\"].isna())]\n",
182
+ "print(f\"Dropped {orig_len-len(df)} rows, new len {len(df)}\")"
183
+ ]
184
+ },
185
+ {
186
+ "attachments": {},
187
+ "cell_type": "markdown",
188
+ "metadata": {},
189
+ "source": [
190
+ "#### drop rows with really large bias, as 100.0 is $\\simeq \\infty$"
191
+ ]
192
+ },
193
+ {
194
+ "cell_type": "code",
195
+ "execution_count": null,
196
+ "metadata": {},
197
+ "outputs": [],
198
+ "source": [
199
+ "orig_len = len(df)\n",
200
+ "\n",
201
+ "df = df[df[\"bl_logit_bias\"] <= 100.0]\n",
202
+ "\n",
203
+ "print(f\"Dropped {orig_len-len(df)} rows, new len {len(df)}\")"
204
+ ]
205
+ },
206
+ {
207
+ "attachments": {},
208
+ "cell_type": "markdown",
209
+ "metadata": {},
210
+ "source": [
211
+ "#### drop rows that use sampling together with beam search, a combination not considered at this time"
212
+ ]
213
+ },
214
+ {
215
+ "cell_type": "code",
216
+ "execution_count": null,
217
+ "metadata": {},
218
+ "outputs": [],
219
+ "source": [
220
+ "orig_len = len(df)\n",
221
+ "\n",
222
+ "# df = df[df[\"bl_hparams\"].apply(lambda tup: (tup[0] == False and tup[2] != 1) or (tup[0] == True and tup[2] == 1) or (tup[0] == False))]\n",
223
+ "df = df[((df[\"use_sampling\"]==True) & (df[\"num_beams\"] == 1)) | (df[\"use_sampling\"]==False)]\n",
224
+ "\n",
225
+ "print(f\"Dropped {orig_len-len(df)} rows, new len {len(df)}\")"
226
+ ]
227
+ },
228
+ {
229
+ "cell_type": "markdown",
230
+ "metadata": {},
231
+ "source": [
232
+ "#### correct the sampling temp column"
233
+ ]
234
+ },
235
+ {
236
+ "cell_type": "code",
237
+ "execution_count": null,
238
+ "metadata": {},
239
+ "outputs": [],
240
+ "source": [
241
+ "df.loc[df[\"use_sampling\"]==False,\"sampling_temp\"] = df.loc[df[\"use_sampling\"]==False,\"sampling_temp\"].fillna(0.0)\n",
242
+ "df.loc[df[\"use_sampling\"]==True,\"sampling_temp\"] = df.loc[df[\"use_sampling\"]==True,\"sampling_temp\"].fillna(1.0)"
243
+ ]
244
+ },
245
+ {
246
+ "attachments": {},
247
+ "cell_type": "markdown",
248
+ "metadata": {},
249
+ "source": [
250
+ "#### marking the hard blacklist rows as having inf/very large bias\n",
251
+ "\n",
252
+ "(after the > 100.0 bias drop)"
253
+ ]
254
+ },
255
+ {
256
+ "cell_type": "code",
257
+ "execution_count": null,
258
+ "metadata": {},
259
+ "outputs": [],
260
+ "source": [
261
+ "df.loc[df[\"bl_type\"]==\"hard\",\"bl_logit_bias\"] = np.inf\n",
262
+ "# df.loc[df[\"bl_type\"]==\"hard\",\"bl_logit_bias\"] = 10000 # crosscheck with whats hardcoded in the bl processor"
263
+ ]
264
+ },
265
+ {
266
+ "attachments": {},
267
+ "cell_type": "markdown",
268
+ "metadata": {},
269
+ "source": [
270
+ "#### Rename some parameters"
271
+ ]
272
+ },
273
+ {
274
+ "cell_type": "code",
275
+ "execution_count": null,
276
+ "metadata": {},
277
+ "outputs": [],
278
+ "source": [
279
+ "df[\"delta\"] = df[\"bl_logit_bias\"].values\n",
280
+ "df[\"gamma\"] = 1 - df[\"bl_proportion\"].values\n",
281
+ "df[\"gamma\"] = df[\"gamma\"].round(3)\n",
282
+ "\n",
283
+ "df[\"no_bl_act_num_wl_tokens\"] = np.round(df[\"no_bl_whitelist_fraction\"].values*df[\"no_bl_num_tokens_generated\"],1) # round to 1 for sanity\n",
284
+ "df[\"w_bl_act_num_wl_tokens\"] = np.round(df[\"w_bl_whitelist_fraction\"].values*df[\"w_bl_num_tokens_generated\"],1) # round to 1 for sanity\n",
285
+ "\n",
286
+ "df[\"w_bl_std_num_wl_tokens\"] = np.sqrt(df[\"w_bl_var_num_wl_tokens\"].values)\n",
287
+ "\n",
288
+ "if \"real_completion_length\" in df.columns:\n",
289
+ " df[\"baseline_num_tokens_generated\"] = df[\"real_completion_length\"].values\n",
290
+ "\n",
291
+ "if \"actual_attacked_ratio\" in df.columns:\n",
292
+ " df[\"actual_attacked_fraction\"] = df[\"actual_attacked_ratio\"].values*df[\"replace_ratio\"].values\n",
293
+ "\n",
294
+ "if \"meta\" in df.columns:\n",
295
+ " df[\"pile_set_name\"] = df[\"meta\"].apply(lambda dict: dict[\"pile_set_name\"])\n",
296
+ "\n",
297
+ "df[\"baseline_hit_list_length\"] = df[\"baseline_hit_list\"].apply(len)\n",
298
+ "df[\"no_bl_hit_list_length\"] = df[\"no_bl_hit_list\"].apply(len)\n",
299
+ "df[\"w_bl_hit_list_length\"] = df[\"w_bl_hit_list\"].apply(len)"
300
+ ]
301
+ },
302
+ {
303
+ "cell_type": "code",
304
+ "execution_count": null,
305
+ "metadata": {},
306
+ "outputs": [],
307
+ "source": [
308
+ "# for pile outlier filtering\n",
309
+ "df[\"w_bl_space_count\"] = df[\"w_bl_output\"].apply(lambda string: string.count(\" \"))\n",
310
+ "df[\"no_bl_space_count\"] = df[\"no_bl_output\"].apply(lambda string: string.count(\" \"))\n",
311
+ "df[\"baseline_space_count\"] = df[\"baseline_completion\"].apply(lambda string: string.count(\" \"))\n",
312
+ "\n",
313
+ "df[\"w_bl_space_frac\"] = df[\"w_bl_space_count\"].values / df[\"w_bl_hit_list_length\"]\n",
314
+ "df[\"no_bl_space_frac\"] = df[\"no_bl_space_count\"].values / df[\"no_bl_hit_list_length\"]\n",
315
+ "df[\"baseline_space_frac\"] = df[\"baseline_space_count\"].values / df[\"baseline_hit_list_length\"]"
316
+ ]
317
+ },
318
+ {
319
+ "attachments": {},
320
+ "cell_type": "markdown",
321
+ "metadata": {},
322
+ "source": [
323
+ "### Filter for the generation lengths we want to look at"
324
+ ]
325
+ },
326
+ {
327
+ "cell_type": "code",
328
+ "execution_count": null,
329
+ "metadata": {},
330
+ "outputs": [],
331
+ "source": [
332
+ "orig_len = len(df)\n",
333
+ "\n",
334
+ "# # main filters\n",
335
+ "# # df = df[(df[\"real_completion_length\"] == 200) & (df[\"w_bl_num_tokens_generated\"] == 200)]\n",
336
+ "# df = df[(df[\"gamma\"] == 0.1) | (df[\"gamma\"] == 0.25) | (df[\"gamma\"] == 0.5)]\n",
337
+ "# df = df[(df[\"delta\"] == 1.0) | (df[\"delta\"] == 2.0) | (df[\"delta\"] == 10.0)]\n",
338
+ "# df = df[(df[\"use_sampling\"] == True)]\n",
339
+ "# df = df[(df[\"bl_type\"] == \"soft\")]\n",
340
+ "\n",
341
+ "# df = df[(df[\"real_completion_length\"] == 200) & (df[\"no_bl_num_tokens_generated\"] == 200) & (df[\"w_bl_num_tokens_generated\"] == 200)] # now also applies to the truncated version\n",
342
+ "# df = df[(df[\"no_bl_num_tokens_generated\"] >= 500) & (df[\"w_bl_num_tokens_generated\"] >= 500)] # all gas noop\n",
343
+ "\n",
344
+ "# # # attack specific\n",
345
+ "# df = df[(df[\"real_completion_length\"] == 200) & (df[\"no_bl_num_tokens_generated\"] == 200) & (df[\"w_bl_num_tokens_generated\"] == 200)]\n",
346
+ "# df = df[(df[\"replace_ratio\"] <= 0.7)]\n",
347
+ "\n",
348
+ "# NOTE pile only\n",
349
+ "df = df[df[\"w_bl_space_frac\"] <= 0.9]\n",
350
+ "df = df[df[\"no_bl_space_frac\"] <= 0.9]\n",
351
+ "# df = df[df[\"pile_set_name\"] != \"Github\"]\n",
352
+ "\n",
353
+ "upper_T = 205\n",
354
+ "lower_T = 195\n",
355
+ "df = df[(df[\"baseline_hit_list_length\"] >= lower_T) & (df[\"no_bl_hit_list_length\"] >= lower_T) & (df[\"w_bl_hit_list_length\"] >= lower_T)] # now also applies to the truncated version\n",
356
+ "df = df[(df[\"baseline_hit_list_length\"] <= upper_T) & (df[\"no_bl_hit_list_length\"] <= upper_T) & (df[\"w_bl_hit_list_length\"] <= upper_T)] # now also applies to the truncated version\n",
357
+ "\n",
358
+ "\n",
359
+ "print(f\"Dropped {orig_len-len(df)} rows, new len {len(df)}\")"
360
+ ]
361
+ },
362
+ {
363
+ "attachments": {},
364
+ "cell_type": "markdown",
365
+ "metadata": {},
366
+ "source": [
367
+ "# Add z-scores (convert the raw watermark measurement, the whitelist fraction, to a z-score)"
368
+ ]
369
+ },
370
+ {
371
+ "cell_type": "code",
372
+ "execution_count": null,
373
+ "metadata": {},
374
+ "outputs": [],
375
+ "source": [
376
+ "from math import sqrt\n",
377
+ "import scipy.stats\n",
378
+ "def compute_z_score(observed_wl_frac, T, gamma):\n",
379
+ " numer = observed_wl_frac - gamma\n",
380
+ " denom = sqrt(gamma*(1-gamma)/T)\n",
381
+ " z = numer/denom\n",
382
+ " return z\n",
383
+ "\n",
384
+ "def compute_wl_for_z(z, T, gamma):\n",
385
+ " denom = sqrt(gamma*(1-gamma)/T)\n",
386
+ " numer = ((z*denom)+gamma)*T\n",
387
+ " return numer\n",
388
+ "\n",
389
+ "def compute_p_value(z):\n",
390
+ " p_value = scipy.stats.norm.sf(abs(z))\n",
391
+ " return p_value\n",
392
+ "\n",
393
+ "df[\"baseline_z_score\"] = df[[\"baseline_whitelist_fraction\", \"baseline_num_tokens_generated\", \"gamma\"]].apply(lambda tup: compute_z_score(*tup), axis=1)\n",
394
+ "df[\"no_bl_z_score\"] = df[[\"no_bl_whitelist_fraction\", \"no_bl_num_tokens_generated\", \"gamma\"]].apply(lambda tup: compute_z_score(*tup), axis=1)\n",
395
+ "df[\"w_bl_z_score\"] = df[[\"w_bl_whitelist_fraction\", \"w_bl_num_tokens_generated\", \"gamma\"]].apply(lambda tup: compute_z_score(*tup), axis=1)\n",
396
+ "\n",
397
+ "if \"w_bl_attacked_whitelist_fraction\" in df.columns:\n",
398
+ " df[\"w_bl_attacked_z_score\"] = df[[\"w_bl_attacked_whitelist_fraction\", \"w_bl_attacked_num_tokens_generated\", \"gamma\"]].apply(lambda tup: compute_z_score(*tup), axis=1)"
399
+ ]
400
+ },
401
+ {
402
+ "cell_type": "code",
403
+ "execution_count": null,
404
+ "metadata": {},
405
+ "outputs": [],
406
+ "source": [
407
+ "# if attacked in df\n",
408
+ "if \"w_bl_attacked_whitelist_fraction\" in df.columns:\n",
409
+ " df[\"w_bl_attacked_act_num_wl_tokens\"] = np.round(df[\"w_bl_attacked_whitelist_fraction\"].values*df[\"w_bl_attacked_num_tokens_generated\"],1) # round to 1 for sanity\n",
410
+ "\n",
411
+ " df[\"w_bl_attacked_z_score\"] = df[[\"w_bl_attacked_whitelist_fraction\", \"w_bl_attacked_num_tokens_generated\", \"gamma\"]].apply(lambda tup: compute_z_score(*tup), axis=1)\n",
412
+ "\n",
413
+ " df[[\"bl_proportion\",\"w_bl_attacked_whitelist_fraction\", \"w_bl_attacked_num_tokens_generated\",\"w_bl_attacked_act_num_wl_tokens\", \"w_bl_attacked_z_score\"]]"
414
+ ]
415
+ },
416
+ {
417
+ "attachments": {},
418
+ "cell_type": "markdown",
419
+ "metadata": {},
420
+ "source": [
421
+ "# Prepare groupby (decide which hyperparameters to group the rows by)"
422
+ ]
423
+ },
424
+ {
425
+ "cell_type": "code",
426
+ "execution_count": null,
427
+ "metadata": {},
428
+ "outputs": [],
429
+ "source": [
430
+ "# groupby_fields = ['num_beams', 'max_new_tokens']\n",
431
+ "# groupby_fields = ['use_sampling','num_beams', 'max_new_tokens']\n",
432
+ "# groupby_fields = ['use_sampling','num_beams', 'max_new_tokens', 'bl_logit_bias']\n",
433
+ "# groupby_fields = ['use_sampling','num_beams', 'max_new_tokens', 'bl_type','bl_logit_bias']\n",
434
+ "# groupby_fields = ['use_sampling','sampling_temp','num_beams', 'max_new_tokens', 'bl_type','bl_logit_bias']\n",
435
+ "# groupby_fields = ['use_sampling','sampling_temp','num_beams', 'max_new_tokens', 'bl_type','bl_logit_bias','bl_proportion']\n",
436
+ "# groupby_fields = ['use_sampling','num_beams','bl_type','bl_logit_bias','bl_proportion']\n",
437
+ "\n",
438
+ "if \"w_bl_attacked_whitelist_fraction\" in df.columns: \n",
439
+ " groupby_fields = ['use_sampling','num_beams','gamma','delta', 'replace_ratio'] # attack grouping\n",
440
+ "else:\n",
441
+ " groupby_fields = ['use_sampling','num_beams','delta','gamma'] # regular grouping\n",
442
+ " # groupby_fields = ['use_sampling','delta','gamma'] # regular grouping, but no beam variation\n",
443
+ " # groupby_fields = ['delta','gamma'] # regular grouping, but no beam variation, and all sampling"
444
+ ]
445
+ },
446
+ {
447
+ "attachments": {},
448
+ "cell_type": "markdown",
449
+ "metadata": {},
450
+ "source": [
451
+ "### narrowing in on the interquartile (IQR) range (not generally used)\n",
452
+ "\n",
453
+ "(removing outliers by subsetting to rows near the mean etc.)"
454
+ ]
455
+ },
456
+ {
457
+ "cell_type": "code",
458
+ "execution_count": null,
459
+ "metadata": {},
460
+ "outputs": [],
461
+ "source": [
462
+ "# tmp_grped_25 = df.groupby(groupby_fields, as_index= False)['avg_spike_entropy'].quantile(q=0.25).rename(columns={'avg_spike_entropy': 'avg_spike_entropy_25th'})\n",
463
+ "# tmp_grped_50 = df.groupby(groupby_fields, as_index= False)['avg_spike_entropy'].quantile(q=0.5).rename(columns={'avg_spike_entropy': 'avg_spike_entropy_50th'})\n",
464
+ "# tmp_grped_75 = df.groupby(groupby_fields, as_index= False)['avg_spike_entropy'].quantile(q=0.75).rename(columns={'avg_spike_entropy': 'avg_spike_entropy_75th'})\n",
465
+ "# df = df.merge(tmp_grped_25, on = groupby_fields)\n",
466
+ "# df = df.merge(tmp_grped_50, on = groupby_fields)\n",
467
+ "# df = df.merge(tmp_grped_75, on = groupby_fields)\n",
468
+ "\n",
469
+ "# # tmp_grped_mean = df.groupby(groupby_fields, as_index= False)['avg_spike_entropy'].mean().rename(columns={'avg_spike_entropy': 'avg_spike_entropy_mean'})\n",
470
+ "# # tmp_grped_median = df.groupby(groupby_fields, as_index= False)['avg_spike_entropy'].median().rename(columns={'avg_spike_entropy': 'avg_spike_entropy_median'})\n",
471
+ "# # df = df.merge(tmp_grped_mean, on = groupby_fields)\n",
472
+ "# # df = df.merge(tmp_grped_median, on = groupby_fields)\n"
473
+ ]
474
+ },
475
+ {
476
+ "cell_type": "code",
477
+ "execution_count": null,
478
+ "metadata": {},
479
+ "outputs": [],
480
+ "source": [
481
+ "# # eps = 0.001\n",
482
+ "# eps = 0.005\n",
483
+ "# df[\"avg_spike_entropy_mean_minus_eps\"] = df['avg_spike_entropy_mean']-eps\n",
484
+ "# df[\"avg_spike_entropy_mean_plus_eps\"] = df['avg_spike_entropy_mean']+eps\n",
485
+ "\n",
486
+ "# df[\"avg_spike_entropy_median_minus_eps\"] = df['avg_spike_entropy_median']-eps\n",
487
+ "# df[\"avg_spike_entropy_median_plus_eps\"] = df['avg_spike_entropy_median']+eps\n",
488
+ "# print(df.columns)"
489
+ ]
490
+ },
491
+ {
492
+ "cell_type": "code",
493
+ "execution_count": null,
494
+ "metadata": {},
495
+ "outputs": [],
496
+ "source": [
497
+ "# # df[[\"avg_spike_entropy_25th\",\"avg_spike_entropy_75th\"]]\n",
498
+ "# df[[\"avg_spike_entropy_mean_minus_eps\",\"avg_spike_entropy_mean\",\"avg_spike_entropy_mean_plus_eps\"]]\n",
499
+ "# df[[\"avg_spike_entropy_median_minus_eps\",\"avg_spike_entropy_median\",\"avg_spike_entropy_median_plus_eps\"]]"
500
+ ]
501
+ },
502
+ {
503
+ "cell_type": "code",
504
+ "execution_count": null,
505
+ "metadata": {},
506
+ "outputs": [],
507
+ "source": [
508
+ "# orig_len = len(df)\n",
509
+ "\n",
510
+ "# subdf = df[(df[\"avg_spike_entropy\"] >= df[\"avg_spike_entropy_25th\"]) & (df[\"avg_spike_entropy\"] <= df[\"avg_spike_entropy_75th\"])]\n",
511
+ "\n",
512
+ "# # subdf = df[(df[\"avg_spike_entropy\"] >= df[\"avg_spike_entropy_mean_minus_eps\"]) & (df[\"avg_spike_entropy\"] <= df[\"avg_spike_entropy_mean_plus_eps\"])]\n",
513
+ "# # subdf = df[(df[\"avg_spike_entropy\"] >= df[\"avg_spike_entropy_mean_minus_eps\"]) & (df[\"avg_spike_entropy\"] <= df[\"avg_spike_entropy_mean_plus_eps\"])]\n",
514
+ "\n",
515
+ "# print(f\"Dropped {orig_len-len(subdf)} rows, new len {len(subdf)}\")"
516
+ ]
517
+ },
518
+ {
519
+ "cell_type": "code",
520
+ "execution_count": null,
521
+ "metadata": {},
522
+ "outputs": [],
523
+ "source": [
524
+ "# subdf.groupby(groupby_fields)['avg_spike_entropy'].describe()\n",
525
+ "# df.groupby(groupby_fields)['avg_spike_entropy'].describe()"
526
+ ]
527
+ },
528
+ {
529
+ "cell_type": "code",
530
+ "execution_count": null,
531
+ "metadata": {},
532
+ "outputs": [],
533
+ "source": [
534
+ "# df = subdf"
535
+ ]
536
+ },
537
+ {
538
+ "attachments": {},
539
+ "cell_type": "markdown",
540
+ "metadata": {},
541
+ "source": [
542
+ "# Perform the groupby (group rows by their hyperparameter settings)"
543
+ ]
544
+ },
545
+ {
546
+ "cell_type": "code",
547
+ "execution_count": null,
548
+ "metadata": {},
549
+ "outputs": [],
550
+ "source": [
551
+ "grouped_df = df.groupby(groupby_fields)"
552
+ ]
553
+ },
554
+ {
555
+ "cell_type": "code",
556
+ "execution_count": null,
557
+ "metadata": {},
558
+ "outputs": [],
559
+ "source": [
560
+ "print(f\"Number of rows after filtering: {len(df)}\")\n",
561
+ "print(f\"Number of groups: {len(grouped_df)}\")"
562
+ ]
563
+ },
564
+ {
565
+ "attachments": {},
566
+ "cell_type": "markdown",
567
+ "metadata": {},
568
+ "source": [
569
+ "# Loop to compute \"confusion matrix\" entries (TPR, FPR, etc.) at selected z-score thresholds for tabulation (Tables 2 & 8)"
570
+ ]
571
+ },
572
+ {
573
+ "cell_type": "code",
574
+ "execution_count": null,
575
+ "metadata": {},
576
+ "outputs": [],
577
+ "source": [
578
+ "import sklearn.metrics as metrics\n",
579
+ "\n",
580
+ "def reject_null_hypo(z_score=None,cuttoff=None):\n",
581
+ " return z_score > cuttoff\n",
582
+ "\n",
583
+ "records = []\n",
584
+ "\n",
585
+ "for group_params in tqdm(list(grouped_df.groups.keys())):\n",
586
+ " sub_df = grouped_df.get_group(group_params)\n",
587
+ " grp_size = len(sub_df)\n",
588
+ "\n",
589
+ " # baseline_z_scores = sub_df[\"baseline_z_score\"].values\n",
590
+ " # w_bl_z_scores = sub_df[\"w_bl_z_score\"].values\n",
591
+ " # all_scores = np.concatenate([baseline_z_scores,w_bl_z_scores])\n",
592
+ "\n",
593
+ " # baseline_labels = np.zeros_like(baseline_z_scores)\n",
594
+ " # attacked_labels = np.ones_like(w_bl_z_scores)\n",
595
+ " # all_labels = np.concatenate([baseline_labels,attacked_labels])\n",
596
+ "\n",
597
+ " # fpr, tpr, thresholds = metrics.roc_curve(all_labels, all_scores, pos_label=1)\n",
598
+ " # roc_auc = metrics.auc(fpr, tpr)\n",
599
+ " record = {k:v for k,v in zip(groupby_fields,group_params)}\n",
600
+ "\n",
601
+ " for thresh in [4.0,5.0]:\n",
602
+ " \n",
603
+ " record[\"count\"] = grp_size\n",
604
+ " record[f\"baseline_fpr_at_{thresh}\"] = reject_null_hypo(z_score=sub_df[\"baseline_z_score\"].values,cuttoff=thresh).sum() / grp_size\n",
605
+ " record[f\"baseline_tnr_at_{thresh}\"] = (~reject_null_hypo(z_score=sub_df[\"baseline_z_score\"],cuttoff=thresh)).sum() / grp_size\n",
606
+ " record[f\"no_bl_fpr_at_{thresh}\"] = reject_null_hypo(z_score=sub_df[\"no_bl_z_score\"].values,cuttoff=thresh).sum() / grp_size\n",
607
+ " record[f\"no_bl_tnr_at_{thresh}\"] = (~reject_null_hypo(z_score=sub_df[\"no_bl_z_score\"].values,cuttoff=thresh)).sum() / grp_size\n",
608
+ " record[f\"w_bl_tpr_at_{thresh}\"] = reject_null_hypo(z_score=sub_df[\"w_bl_z_score\"].values,cuttoff=thresh).sum() / grp_size\n",
609
+ " record[f\"w_bl_fnr_at_{thresh}\"] = (~reject_null_hypo(z_score=sub_df[\"w_bl_z_score\"].values,cuttoff=thresh)).sum() / grp_size\n",
610
+ "\n",
611
+ " if \"w_bl_attacked_z_score\" in sub_df.columns:\n",
612
+ " record[f\"w_bl_attacked_tpr_at_{thresh}\"] = reject_null_hypo(z_score=sub_df[\"w_bl_attacked_z_score\"].values,cuttoff=thresh).sum() / grp_size\n",
613
+ " record[f\"w_bl_attacked_fnr_at_{thresh}\"] = (~reject_null_hypo(z_score=sub_df[\"w_bl_attacked_z_score\"].values,cuttoff=thresh)).sum() / grp_size\n",
614
+ "\n",
615
+ " records.append(record)\n",
616
+ "\n",
617
+ " # # df[f\"baseline_fp_at_{thresh}\"] = reject_null_hypo(z_score=df[\"baseline_z_score\"].values,cuttoff=thresh)\n",
618
+ " # # df[f\"baseline_tn_at_{thresh}\"] = ~reject_null_hypo(z_score=df[\"baseline_z_score\"],cuttoff=thresh)\n",
619
+ " # # df[f\"no_bl_fp_at_{thresh}\"] = reject_null_hypo(z_score=df[\"no_bl_z_score\"].values,cuttoff=thresh)\n",
620
+ " # # df[f\"no_bl_tn_at_{thresh}\"] = ~reject_null_hypo(z_score=df[\"no_bl_z_score\"].values,cuttoff=thresh)\n",
621
+ " # # df[f\"w_bl_tp_at_{thresh}\"] = reject_null_hypo(z_score=df[\"w_bl_z_score\"].values,cuttoff=thresh)\n",
622
+ " # # df[f\"w_bl_fn_at_{thresh}\"] = ~reject_null_hypo(z_score=df[\"w_bl_z_score\"].values,cuttoff=thresh)\n",
623
+ "\n",
624
+ "\n",
625
+ "roc_df = pd.DataFrame.from_records(records)\n"
626
+ ]
627
+ },
628
+ {
629
+ "cell_type": "code",
630
+ "execution_count": null,
631
+ "metadata": {},
632
+ "outputs": [],
633
+ "source": [
634
+ "# thresh = 6.0\n",
635
+ "# thresh = 5.0\n",
636
+ "std_threshes = [4.0, 5.0] #, 6.0]\n",
637
+ "# std_threshes = [4.0]\n",
638
+ "\n",
639
+ "# roc_df[\"params\"] = roc_df.index.to_list()\n",
640
+ "\n",
641
+ "# columns = [\"num_beams\", \"delta\", \"gamma\", \"count\"]\n",
642
+ "# columns = [\"delta\", \"gamma\", \"count\"]\n",
643
+ "columns = [\"use_sampling\",\"delta\", \"gamma\", \"count\"]\n",
644
+ "# columns = [\"use_sampling\", \"replace_ratio\", \"count\"]\n",
645
+ "\n",
646
+ "for thresh in std_threshes:\n",
647
+ " # columns += [f\"baseline_fpr_at_{thresh}\",f\"no_bl_fpr_at_{thresh}\",f\"w_bl_tpr_at_{thresh}\"]\n",
648
+ " # columns += [f\"baseline_fpr_at_{thresh}\",f\"baseline_tnr_at_{thresh}\",f\"no_bl_fpr_at_{thresh}\",f\"no_bl_tnr_at_{thresh}\",f\"w_bl_tpr_at_{thresh}\",f\"w_bl_fn_at_{thresh}\"]\n",
649
+ "\n",
650
+ "\n",
651
+ " # columns += [f\"baseline_fpr_at_{thresh}\",f\"baseline_tnr_at_{thresh}\",f\"w_bl_tpr_at_{thresh}\",f\"w_bl_fnr_at_{thresh}\"]\n",
652
+ " \n",
653
+ " if f\"w_bl_attacked_fnr_at_{thresh}\" in roc_df.columns:\n",
654
+ " columns += [f\"w_bl_tpr_at_{thresh}\",f\"w_bl_fnr_at_{thresh}\"]\n",
655
+ " columns += [f\"w_bl_attacked_tpr_at_{thresh}\",f\"w_bl_attacked_fnr_at_{thresh}\"] # if attack\n",
656
+ " else:\n",
657
+ " columns += [f\"baseline_fpr_at_{thresh}\",f\"baseline_tnr_at_{thresh}\",f\"w_bl_tpr_at_{thresh}\",f\"w_bl_fnr_at_{thresh}\"]\n",
658
+ "\n",
659
+ "# filter or not\n",
660
+ "sub_df = roc_df[(roc_df[\"use_sampling\"] == True) & ((roc_df[\"delta\"] == 1.0) | (roc_df[\"delta\"] == 2.0) | (roc_df[\"delta\"] == 5.0)) & ((roc_df[\"gamma\"] == 0.25) |(roc_df[\"gamma\"] == 0.5) )]\n",
661
+ "# sub_df = roc_df[(roc_df[\"use_sampling\"] == False) & ((roc_df[\"delta\"] == 1.0) | (roc_df[\"delta\"] == 2.0) | (roc_df[\"delta\"] == 5.0)) & ((roc_df[\"gamma\"] == 0.25) |(roc_df[\"gamma\"] == 0.5) ) & (roc_df[\"num_beams\"] == 8)]\n",
662
+ "# sub_df = roc_df[(roc_df[\"replace_ratio\"] == 0.1) | (roc_df[\"replace_ratio\"] == 0.3) | (roc_df[\"replace_ratio\"] == 0.5) | (roc_df[\"replace_ratio\"] == 0.7)]\n",
663
+ "# sub_df = roc_df[(roc_df[\"num_beams\"] == 8)]\n",
664
+ "# sub_df = roc_df\n",
665
+ "\n",
666
+ "# sub_df.sort_values(\"delta\")[columns]\n",
667
+ "# sub_df.sort_values(\"num_beams\")[columns]\n",
668
+ "sub_df.sort_values(by=[\"delta\",\"gamma\"],ascending=[True, False])[columns]"
669
+ ]
670
+ },
671
+ {
672
+ "attachments": {},
673
+ "cell_type": "markdown",
674
+ "metadata": {},
675
+ "source": [
676
+ "#### write tables to latex"
677
+ ]
678
+ },
679
+ {
680
+ "cell_type": "code",
681
+ "execution_count": null,
682
+ "metadata": {},
683
+ "outputs": [],
684
+ "source": [
685
+ "# print(roc_df[columns].drop([\"count\"],axis=1).sort_values(\"gamma\").round(3).to_latex(index=False))\n",
686
+ "# print(roc_df[columns].drop([\"count\"],axis=1).sort_values(\"delta\").round(3).to_latex(index=False))\n",
687
+ "# print(roc_df[columns].drop([\"count\"],axis=1).sort_values(\"num_beams\").round(3).to_latex(index=False))\n",
688
+ "\n",
689
+ "# print(sub_df.sort_values(by=[\"delta\",\"gamma\"],ascending=[True, False])[columns].round(3).to_latex(index=False))\n",
690
+ "# print(sub_df.sort_values(\"num_beams\")[columns].round(3).to_latex(index=False))"
691
+ ]
692
+ },
693
+ {
694
+ "attachments": {},
695
+ "cell_type": "markdown",
696
+ "metadata": {},
697
+ "source": [
698
+ "# ROC: No Attack (figure 4)"
699
+ ]
700
+ },
701
+ {
702
+ "cell_type": "code",
703
+ "execution_count": null,
704
+ "metadata": {},
705
+ "outputs": [],
706
+ "source": [
707
+ "plt.clf()\n",
708
+ "plt.figure(constrained_layout=True)\n",
709
+ "plt.figure(figsize=(5, 4))\n",
710
+ "\n",
711
+ "import sklearn.metrics as metrics\n",
712
+ "\n",
713
+ "zoom = False\n",
714
+ "# zoom = True\n",
715
+ "\n",
716
+ "beam_search = None\n",
717
+ "# beam_search = 1\n",
718
+ "# beam_search = 4\n",
719
+ "# beam_search = 8\n",
720
+ "\n",
721
+ "deltas = [1.0,2.0,5.0,10.0]\n",
722
+ "# gammas = [0.25, 0.5]\n",
723
+ "gammas = [0.25]\n",
724
+ "# gammas = [0.5]\n",
725
+ "\n",
726
+ "# deltas = [1.0,2.0,5.0,10.0]\n",
727
+ "# gammas = [0.1,0.5]\n",
728
+ "\n",
729
+ "groups = []\n",
730
+ "names = []\n",
731
+ "for d in deltas:\n",
732
+ " for g in gammas:\n",
733
+ " if beam_search:\n",
734
+ " groups.append((False, beam_search, d, g))\n",
735
+ " else:\n",
736
+ " groups.append((True, 1, d, g))\n",
737
+ " names.append(f\"$\\delta:{d},\\gamma:{g}$\")\n",
738
+ "groups=groups[::-1]\n",
739
+ "names=names[::-1]\n",
740
+ "\n",
741
+ "# Make colormap\n",
742
+ "import matplotlib.pyplot as plt\n",
743
+ "viridis = plt.colormaps['viridis'].resampled(len(groups)+1) \n",
744
+ "cmap = viridis.colors[:len(groups)][::-1]\n",
745
+ "\n",
746
+ "# plot different parameter levels\n",
747
+ "for i,(group,name) in enumerate(zip(groups,names)):\n",
748
+ "\n",
749
+ " baseline_z_scores = grouped_df.get_group(group)[\"baseline_z_score\"].values\n",
750
+ " w_bl_z_scores = grouped_df.get_group(group)[\"w_bl_z_score\"].values\n",
751
+ " all_scores = np.concatenate([baseline_z_scores,w_bl_z_scores])\n",
752
+ "\n",
753
+ " baseline_labels = np.zeros_like(baseline_z_scores)\n",
754
+ " attacked_labels = np.ones_like(w_bl_z_scores)\n",
755
+ " all_labels = np.concatenate([baseline_labels,attacked_labels])\n",
756
+ "\n",
757
+ " fpr, tpr, thresholds = metrics.roc_curve(all_labels, all_scores, pos_label=1)\n",
758
+ " roc_auc = metrics.auc(fpr, tpr)\n",
759
+ "\n",
760
+ " plt.plot(fpr, tpr, color=cmap[i], label = f'{name}, AUC:%0.3f, PPL:{round(grouped_df[\"w_bl_ppl\"].describe().loc[group][\"mean\"],1)}' % roc_auc, linewidth=3)\n",
761
+ "\n",
762
+ "if \"w_bl_attacked_ppl\" in df.columns:\n",
763
+ " pass\n",
764
+ "else:\n",
765
+ " # # vanilla ppl value\n",
766
+ " plt.scatter([-1],[-1],label=f' $\\delta=0$, PPL: {round(grouped_df[\"no_bl_ppl\"].describe().loc[groups,\"mean\"].mean(),1)}', color=\"white\")\n",
767
+ "\n",
768
+ "if zoom:\n",
769
+ " if not \"w_bl_attacked_ppl\" in df.columns:\n",
770
+ " plt.legend(loc = 'lower right', fontsize = 12)\n",
771
+ " plt.xscale(\"log\")\n",
772
+ " # plt.yscale(\"log\")\n",
773
+ " plt.xlim([0, 1])\n",
774
+ " plt.ylim([0.5, 1])\n",
775
+ " plot_name = (\"roc_auc_zoom\" if not beam_search else f\"roc_auc_zoom_greedy_beams_{beam_search}\")\n",
776
+ "\n",
777
+ "else:\n",
778
+ " if \"w_bl_attacked_ppl\" in df.columns:\n",
779
+ " plt.legend(loc = 'lower right', fontsize = 12)\n",
780
+ " plt.plot([0, 1], [0, 1],'r--')\n",
781
+ " plt.xlim([0, 1])\n",
782
+ " plt.ylim([0, 1])\n",
783
+ " plot_name = (\"roc_auc\" if not beam_search else f\"roc_auc_greedy_beams_{beam_search}\")\n",
784
+ "\n",
785
+ "plt.ylabel('True Positive Rate', fontsize = 12)\n",
786
+ "plt.xlabel('False Positive Rate', fontsize = 12)\n",
787
+ "\n",
788
+ "print(plot_name)\n",
789
+ "\n",
790
+ "# fname = f\"figs/{plot_name}.pdf\"\n",
791
+ "# plt.savefig(fname, format=\"pdf\")\n",
792
+ "\n",
793
+ "plt.show()"
794
+ ]
795
+ },
796
+ {
797
+ "attachments": {},
798
+ "cell_type": "markdown",
799
+ "metadata": {},
800
+ "source": [
801
+ "\n",
802
+ "# ROC: Attack (figure 6)"
803
+ ]
804
+ },
805
+ {
806
+ "cell_type": "code",
807
+ "execution_count": null,
808
+ "metadata": {},
809
+ "outputs": [],
810
+ "source": [
811
+ "import sklearn.metrics as metrics\n",
812
+ "\n",
813
+ "plt.clf()\n",
814
+ "plt.figure(constrained_layout=True)\n",
815
+ "plt.figure(figsize=(5, 4))\n",
816
+ "\n",
817
+ "# attack_budgets = [0.1,0.2,0.3,0.4,0.5,0.6,0.7]\n",
818
+ "attack_budgets = [0.1,0.3,0.5,0.7]\n",
819
+ "groups = [(True, 1, 0.5, 2.0, budget) for budget in attack_budgets]\n",
820
+ "beams = False\n",
821
+ "# groups = [(False, 8, 0.5, 2.0, budget) for budget in attack_budgets]\n",
822
+ "# beams = True\n",
823
+ "\n",
824
+ "names = [f\"$\\epsilon={eps}$\" for eps in attack_budgets]\n",
825
+ "\n",
826
+ "# Make colormap\n",
827
+ "import matplotlib.pyplot as plt\n",
828
+ "viridis = plt.colormaps['viridis'].resampled(len(groups)+1+1) # attack\n",
829
+ "cmap = viridis.colors[:len(groups)+1][::-1]\n",
830
+ "\n",
831
+ "# plot original\n",
832
+ "group = groups[0] # any will do\n",
833
+ "baseline_z_scores = grouped_df.get_group(group)[\"baseline_z_score\"].values\n",
834
+ "baseline_labels = np.zeros_like(baseline_z_scores)\n",
835
+ "\n",
836
+ "orig_watermark_z_scores = grouped_df.get_group(group)[\"w_bl_z_score\"].values\n",
837
+ "watermark_labels = np.ones_like(orig_watermark_z_scores)\n",
838
+ "\n",
839
+ "all_scores = np.concatenate([baseline_z_scores,orig_watermark_z_scores])\n",
840
+ "all_labels = np.concatenate([baseline_labels,watermark_labels])\n",
841
+ "\n",
842
+ "fpr, tpr, thresholds = metrics.roc_curve(all_labels, all_scores, pos_label=1)\n",
843
+ "roc_auc = metrics.auc(fpr, tpr)\n",
844
+ "\n",
845
+ "plt.plot(fpr, tpr, color=cmap[0], label = f'unattacked, AUC:%0.3f, PPL:{round(grouped_df[\"w_bl_ppl\"].describe().loc[group][\"mean\"],1)}' % roc_auc, linewidth=3)\n",
846
+ "\n",
847
+ "# plot different attack levels\n",
848
+ "for i,(group,name) in enumerate(zip(groups,names)):\n",
849
+ "\n",
850
+ " baseline_z_scores = grouped_df.get_group(group)[\"baseline_z_score\"].values\n",
851
+ " attacked_z_scores = grouped_df.get_group(group)[\"w_bl_attacked_z_score\"].values\n",
852
+ " all_scores = np.concatenate([baseline_z_scores,attacked_z_scores])\n",
853
+ "\n",
854
+ " baseline_labels = np.zeros_like(baseline_z_scores)\n",
855
+ " attacked_labels = np.ones_like(attacked_z_scores)\n",
856
+ " all_labels = np.concatenate([baseline_labels,attacked_labels])\n",
857
+ "\n",
858
+ " fpr, tpr, thresholds = metrics.roc_curve(all_labels, all_scores, pos_label=1)\n",
859
+ " roc_auc = metrics.auc(fpr, tpr)\n",
860
+ "\n",
861
+ " plt.plot(fpr, tpr, color=cmap[i+1], label = f'{name}, AUC:%0.3f, PPL:{round(grouped_df[\"w_bl_attacked_ppl\"].describe().loc[group][\"mean\"],1)}' % roc_auc, linewidth=3)\n",
862
+ "\n",
863
+ "if \"w_bl_attacked_ppl\" in df.columns:\n",
864
+ " pass\n",
865
+ "else:\n",
866
+ " # # vanilla ppl value\n",
867
+ " plt.scatter([-1],[-1],label=f' $\\delta=0$, PPL: {round(grouped_df[\"no_bl_ppl\"].describe().loc[groups,\"mean\"].mean(),1)}', color=\"white\")\n",
868
+ "\n",
869
+ "zoom = False\n",
870
+ "# zoom = True\n",
871
+ "if zoom:\n",
872
+ " if not \"w_bl_attacked_ppl\" in df.columns:\n",
873
+ " plt.legend(loc = 'lower right')\n",
874
+ " plt.xscale(\"log\")\n",
875
+ " # plt.yscale(\"log\")\n",
876
+ " plt.xlim([0, 1])\n",
877
+ " plt.ylim([0.5, 1])\n",
878
+ " if \"w_bl_attacked_ppl\" in df.columns:\n",
879
+ " plot_name = \"roc_auc_untargeted_attack_no_beams_zoom\"\n",
880
+ " # plot_name = \"roc_auc_untargeted_attack_with_beams_zoom\"\n",
881
+ " else:\n",
882
+ " plot_name = \"roc_auc_zoom\"\n",
883
+ "else:\n",
884
+ " if \"w_bl_attacked_ppl\" in df.columns:\n",
885
+ " plt.legend(loc = 'lower right',fontsize = 9)\n",
886
+ " plt.plot([0, 1], [0, 1],'r--')\n",
887
+ " plt.xlim([0, 1])\n",
888
+ " plt.ylim([0, 1])\n",
889
+ " if \"w_bl_attacked_ppl\" in df.columns:\n",
890
+ " if beams: plot_name = \"roc_auc_untargeted_attack_w_beams\"\n",
891
+ " if not beams: plot_name = \"roc_auc_untargeted_attack_no_beams\"\n",
892
+ " else:\n",
893
+ " plot_name = \"roc_auc\"\n",
894
+ "\n",
895
+ "plt.ylabel('True Positive Rate')\n",
896
+ "plt.xlabel('False Positive Rate')\n",
897
+ "\n",
898
+ "print(plot_name)\n",
899
+ "\n",
900
+ "# fname = f\"figs/{plot_name}.pdf\"\n",
901
+ "# plt.savefig(fname, format=\"pdf\")\n",
902
+ "\n",
903
+ "plt.show()"
904
+ ]
905
+ },
906
+ {
907
+ "cell_type": "code",
908
+ "execution_count": null,
909
+ "metadata": {},
910
+ "outputs": [],
911
+ "source": []
912
+ },
913
+ {
914
+ "attachments": {},
915
+ "cell_type": "markdown",
916
+ "metadata": {},
917
+ "source": [
918
+ "# Z vs T (figure 3)"
919
+ ]
920
+ },
921
+ {
922
+ "cell_type": "code",
923
+ "execution_count": null,
924
+ "metadata": {},
925
+ "outputs": [],
926
+ "source": [
927
+ "plt.clf()\n",
928
+ "plt.figure(constrained_layout=True)\n",
929
+ "plt.figure(figsize=(5, 4))\n",
930
+ "\n",
931
+ "# save_fig = True\n",
932
+ "save_fig = False\n",
933
+ "\n",
934
+ "z_scores = True\n",
935
+ "# z_scores = False\n",
936
+ "\n",
937
+ "beam_search = None\n",
938
+ "# beam_search = 1\n",
939
+ "# beam_search = 4\n",
940
+ "# beam_search = 8\n",
941
+ "\n",
942
+ "ablate = \"delta\"\n",
943
+ "delta_gammas = [\n",
944
+ " # (0.5,0.25),\n",
945
+ " # (1.0,0.25),\n",
946
+ " # (2.0,0.25),\n",
947
+ " # (5.0,0.25),\n",
948
+ " # (10.0,0.25),\n",
949
+ " (0.5,0.5),\n",
950
+ " (1.0,0.5),\n",
951
+ " (2.0,0.5),\n",
952
+ " (5.0,0.5),\n",
953
+ " (10.0,0.5),\n",
954
+ "]\n",
955
+ "# ablate = \"gamma\"\n",
956
+ "# delta_gammas = [\n",
957
+ "# # (5.0,0.9),\n",
958
+ "# # (5.0,0.75),\n",
959
+ "# # (5.0,0.5),\n",
960
+ "# # (5.0,0.25),\n",
961
+ "# # (5.0,0.1),\n",
962
+ "# (2.0,0.9),\n",
963
+ "# (2.0,0.75),\n",
964
+ "# (2.0,0.5),\n",
965
+ "# (2.0,0.25),\n",
966
+ "# (2.0,0.1),\n",
967
+ "# ]\n",
968
+ "# if not z_scores: delta_gammas = delta_gammas[::-1]\n",
969
+ "\n",
970
+ "groups = []\n",
971
+ "names = []\n",
972
+ "\n",
973
+ "for d,g in delta_gammas:\n",
974
+ " if beam_search:\n",
975
+ " groups.append((False, beam_search, d, g))\n",
976
+ " else:\n",
977
+ " groups.append((True, 1, d, g))\n",
978
+ " names.append(f\"$\\delta:{d},\\gamma:{g}$\")\n",
979
+ "\n",
980
+ "groups=groups[::-1]\n",
981
+ "names=names[::-1]\n",
982
+ "\n",
983
+ "\n",
984
+ "axis_max_t = 200\n",
985
+ "\n",
986
+ "max_t = None\n",
987
+ "# max_t = 200\n",
988
+ "# max_t = 100\n",
989
+ "# max_t = 50\n",
990
+ "\n",
991
+ "# Make colormap\n",
992
+ "import matplotlib.pyplot as plt\n",
993
+ "viridis = plt.colormaps['viridis'].resampled(len(groups)+1) \n",
994
+ "cmap = viridis.colors[:len(groups)][::-1]\n",
995
+ "\n",
996
+ "for grp_idx,(group, name) in enumerate(zip(groups, names)):\n",
997
+ "\n",
998
+ " delta, gamma = group[-2],group[-1]\n",
999
+ "\n",
1000
+ " # this is the series of bools corresponding to token at T being in whitelist\n",
1001
+ " w_bl_hit_list = grouped_df.get_group(group)[\"w_bl_hit_list\"].to_list()\n",
1002
+ "\n",
1003
+ " lengths = [len(l) for l in w_bl_hit_list]\n",
1004
+ " diff_lengths = set(lengths) \n",
1005
+ " counter = {}\n",
1006
+ " for l in lengths:\n",
1007
+ " if counter.get(l):\n",
1008
+ " counter[l] += 1\n",
1009
+ " else:\n",
1010
+ " counter[l] = 1\n",
1011
+ " if max_t:\n",
1012
+ " min_length = min(min(diff_lengths),max_t)\n",
1013
+ " max_t = min_length\n",
1014
+ " else:\n",
1015
+ " min_length = min(diff_lengths)\n",
1016
+ " w_bl_hit_list = [l[:min_length] for l in w_bl_hit_list]\n",
1017
+ "\n",
1018
+ " # wl_hit_matrix = ~np.matrix(w_bl_hit_list)\n",
1019
+ " wl_hit_matrix = (~torch.tensor(w_bl_hit_list, dtype=bool)).to(torch.float)\n",
1020
+ " # wl_hit_matrix\n",
1021
+ "\n",
1022
+ " n = wl_hit_matrix.shape[0]\n",
1023
+ "\n",
1024
+ " if max_t:\n",
1025
+ " t_values = torch.arange(0,max_t)\n",
1026
+ " indices = torch.arange(0,max_t)\n",
1027
+ " else:\n",
1028
+ " t_values = torch.arange(0,wl_hit_matrix.shape[1])\n",
1029
+ " indices = torch.arange(0,wl_hit_matrix.shape[1])\n",
1030
+ " # print(t_values[:10])\n",
1031
+ "\n",
1032
+ " avg_cumulative = list()\n",
1033
+ " std_cumulative = list()\n",
1034
+ " prc_25_cumulative = list()\n",
1035
+ " prc_50_cumulative = list()\n",
1036
+ " prc_75_cumulative = list()\n",
1037
+ "\n",
1038
+ " prc_25_seq_indices = list()\n",
1039
+ "\n",
1040
+ " for idx in indices:\n",
1041
+ "\n",
1042
+ " hits_upto_t = wl_hit_matrix[:,:idx+1]\n",
1043
+ " cumulative_sum_to_t = hits_upto_t.sum(axis=1)\n",
1044
+ " wl_frac_at_t = cumulative_sum_to_t/(t_values[idx]+1)\n",
1045
+ " \n",
1046
+ " if z_scores:\n",
1047
+ " wl_z_score_at_t = compute_z_score(wl_frac_at_t, t_values[idx], gamma)\n",
1048
+ " avg_at_t = torch.mean(wl_z_score_at_t,axis=0)\n",
1049
+ " std_at_t = torch.std(wl_z_score_at_t,axis=0)\n",
1050
+ " prc_25_at_t = torch.quantile(wl_z_score_at_t,q=0.25,axis=0)\n",
1051
+ " prc_50_at_t = torch.quantile(wl_z_score_at_t,q=0.50,axis=0)\n",
1052
+ " prc_75_at_t = torch.quantile(wl_z_score_at_t,q=0.75,axis=0)\n",
1053
+ "\n",
1054
+ " if gamma == 0.9: # and idx > 20 and idx < 90:\n",
1055
+ " pcen=np.quantile(wl_z_score_at_t,0.75,interpolation='nearest')\n",
1056
+ " i_near=abs(wl_z_score_at_t-pcen).argmin()\n",
1057
+ " # prc_25_seq_indices.append((i_near.item(),pcen))\n",
1058
+ " prc_25_seq_indices.append((i_near.item()))\n",
1059
+ " else:\n",
1060
+ " avg_at_t = torch.mean(wl_frac_at_t,axis=0)\n",
1061
+ " std_at_t = torch.std(wl_frac_at_t,axis=0)\n",
1062
+ " prc_25_at_t = torch.quantile(wl_frac_at_t,q=0.25,axis=0)\n",
1063
+ " prc_50_at_t = torch.quantile(wl_frac_at_t,q=0.50,axis=0)\n",
1064
+ " prc_75_at_t = torch.quantile(wl_frac_at_t,q=0.75,axis=0)\n",
1065
+ "\n",
1066
+ " avg_cumulative.append(avg_at_t.item())\n",
1067
+ " std_cumulative.append(std_at_t.item())\n",
1068
+ " prc_25_cumulative.append(prc_25_at_t.item())\n",
1069
+ " prc_50_cumulative.append(prc_50_at_t.item())\n",
1070
+ " prc_75_cumulative.append(prc_75_at_t.item())\n",
1071
+ "\n",
1072
+ "\n",
1073
+ " print(prc_25_seq_indices)\n",
1074
+ "\n",
1075
+ " avg_cumulative = np.array(avg_cumulative)\n",
1076
+ " std_cumulative = np.array(std_cumulative)\n",
1077
+ " std_err_cumulative = std_cumulative/np.sqrt(n)\n",
1078
+ " var_cumulative = std_cumulative**2\n",
1079
+ " \n",
1080
+ " plt.plot(t_values, avg_cumulative, color=cmap[grp_idx], label=name)\n",
1081
+ "\n",
1082
+ " # bounds stuff\n",
1083
+ "\n",
1084
+ " # plt.plot(t_values, prc_25_cumulative, color=cmap[grp_idx], linestyle=\"dashed\") #, label=name+',25th') \n",
1085
+ " # # plt.plot(t_values, prc_50_cumulative, color=cmap[grp_idx], linestyle='--', label=name+',50th') \n",
1086
+ " # plt.plot(t_values, prc_75_cumulative, color=cmap[grp_idx], linestyle=\"dashed\") #, label=name+',75th ') \n",
1087
+ " # #fill between the upper and lower bands\n",
1088
+ " # plt.fill_between(t_values, prc_25_cumulative, prc_75_cumulative, alpha = .1,color = cmap[grp_idx])\n",
1089
+ " # or just lower\n",
1090
+ " # plt.fill_between(t_values, prc_25_cumulative, avg_cumulative, alpha = .1,color = cmap[grp_idx])\n",
1091
+ "\n",
1092
+ " # plt.plot(t_values, avg_cumulative-std_cumulative, color=cmap[grp_idx], linestyle=\"dashed\") #, label=name+',25th') \n",
1093
+ " # plt.plot(t_values, avg_cumulative+std_cumulative, color=cmap[grp_idx], linestyle=\"dashed\") #, label=name+',25th') \n",
1094
+ " # plt.plot(t_values, avg_cumulative-std_err_cumulative, color=cmap[grp_idx], linestyle=\"dashed\") #, label=name+',25th') \n",
1095
+ " # plt.plot(t_values, avg_cumulative+std_err_cumulative, color=cmap[grp_idx], linestyle=\"dashed\") #, label=name+',25th') \n",
1096
+ " # plt.plot(t_values, avg_cumulative-var_cumulative, color=cmap[grp_idx], linestyle=\"dashed\") #, label=name+',25th') \n",
1097
+ " # plt.plot(t_values, avg_cumulative+var_cumulative, color=cmap[grp_idx], linestyle=\"dashed\") #, label=name+',25th') \n",
1098
+ " # fill between the upper and lower bands\n",
1099
+ " # plt.fill_between(t_values, avg_cumulative-std_cumulative, avg_cumulative+std_cumulative, alpha = .1,color = cmap[grp_idx])\n",
1100
+ " # plt.fill_between(t_values, avg_cumulative-std_err_cumulative, avg_cumulative+std_err_cumulative, alpha = .1,color = cmap[grp_idx])\n",
1101
+ " # or just lower\n",
1102
+ " # plt.fill_between(t_values, avg_cumulative-std_cumulative, avg_cumulative, alpha = .1,color = cmap[grp_idx])\n",
1103
+ " # plt.fill_between(t_values, avg_cumulative-std_err_cumulative, avg_cumulative, alpha = .1,color = cmap[grp_idx])\n",
1104
+ "\n",
1105
+ "# plt.plot([0.0],[0.0],label=f'25th Percentile', linestyle=\"dashed\", color=\"gray\")\n",
1106
+ "\n",
1107
+ "# if beam_search:\n",
1108
+ "# plt.title(f\"Greedy, {beam_search}-way BS\")\n",
1109
+ "\n",
1110
+ "legend_font = 11\n",
1111
+ "\n",
1112
+ "# zoom_midrange = True\n",
1113
+ "# zoom = True\n",
1114
+ "\n",
1115
+ "zoom = False\n",
1116
+ "\n",
1117
+ "if zoom:\n",
1118
+ " if z_scores:\n",
1119
+ " plt.legend(loc = 'upper left', fontsize=legend_font)\n",
1120
+ " else:\n",
1121
+ " plt.legend(loc = 'lower right', fontsize=legend_font)\n",
1122
+ " if zoom_midrange:\n",
1123
+ " plt.xlim([(min_length)/4, (3*(max_t if max_t else min_length)/4)+1])\n",
1124
+ " else:\n",
1125
+ " plt.xlim([0, ((max_t if max_t else min_length)/4)+1])\n",
1126
+ " plot_name = f\"z_vs_t_zoom_ablate_{ablate}\" if z_scores else f\"wl_vs_t_zoom_ablate_{ablate}\"\n",
1127
+ "else:\n",
1128
+ " if z_scores:\n",
1129
+ " plt.legend(loc = 'upper left', fontsize=legend_font)\n",
1130
+ " else:\n",
1131
+ " plt.legend(loc = 'lower right', fontsize=legend_font)\n",
1132
+ " \n",
1133
+ " plt.xlim([0, ((max_t if max_t else min_length))+1])\n",
1134
+ "\n",
1135
+ " plot_name = f\"z_vs_t_ablate_{ablate}\" if z_scores else f\"wl_vs_t_ablate_{ablate}\"\n",
1136
+ "\n",
1137
+ "axes_label_fonts = 14\n",
1138
+ "if z_scores:\n",
1139
+ " plt.ylabel('z-score',fontsize=axes_label_fonts)\n",
1140
+ "else:\n",
1141
+ " plt.ylabel('Whitelist Fraction',fontsize=axes_label_fonts)\n",
1142
+ "plt.xlabel('T',fontsize=axes_label_fonts)\n",
1143
+ "\n",
1144
+ "# import matplotlib.ticker as ticker\n",
1145
+ "# tick_spacing = 5.0\n",
1146
+ "# plt.gca().yaxis.set_major_locator(ticker.MultipleLocator(tick_spacing))\n",
1147
+ "\n",
1148
+ "axes_tick_font = 13\n",
1149
+ "plt.xticks(fontsize=axes_tick_font)\n",
1150
+ "plt.yticks(fontsize=axes_tick_font)\n",
1151
+ "\n",
1152
+ "plt.grid()\n",
1153
+ "plt.tight_layout()\n",
1154
+ "\n",
1155
+ "if beam_search:\n",
1156
+ " if ablate == \"gamma\":\n",
1157
+ " plot_name = f\"greedy_{beam_search}_beams_delta_{delta}\" \n",
1158
+ " if ablate == \"delta\":\n",
1159
+ " plot_name = f\"greedy_{beam_search}_beams_gamma_{gamma}\" \n",
1160
+ "\n",
1161
+ "# plot_name = \"z_vs_t_ablate_gamma_boosted_delta\"\n",
1162
+ "# plot_name = \"z_vs_t_ablate_delta_boosted_gamma\"\n",
1163
+ "\n",
1164
+ "print(plot_name)\n",
1165
+ "\n",
1166
+ "\n",
1167
+ "if save_fig:\n",
1168
+ " # fname = f\"figs/{plot_name}.pdf\"\n",
1169
+ " fname = f\"figs_new/{plot_name}.pdf\"\n",
1170
+ " plt.savefig(fname, format=\"pdf\")\n",
1171
+ "\n",
1172
+ "plt.show()"
1173
+ ]
1174
+ },
1175
+ {
1176
+ "cell_type": "code",
1177
+ "execution_count": null,
1178
+ "metadata": {},
1179
+ "outputs": [],
1180
+ "source": []
1181
+ },
1182
+ {
1183
+ "attachments": {},
1184
+ "cell_type": "markdown",
1185
+ "metadata": {},
1186
+ "source": [
1187
+ "# Set up data for charts (setup for figures 2&7)"
1188
+ ]
1189
+ },
1190
+ {
1191
+ "cell_type": "code",
1192
+ "execution_count": null,
1193
+ "metadata": {},
1194
+ "outputs": [],
1195
+ "source": [
1196
+ "viz_df = pd.DataFrame()\n",
1197
+ "\n",
1198
+ "# aggregating\n",
1199
+ "\n",
1200
+ "# set the hparam keys, including an indiv column for each you want to ablate on\n",
1201
+ "viz_df[\"bl_hparams\"] = grouped_df[\"w_bl_exp_whitelist_fraction\"].describe().index.to_list()\n",
1202
+ "for i,key in enumerate(groupby_fields):\n",
1203
+ " viz_df[key] = viz_df[\"bl_hparams\"].apply(lambda tup: tup[i])\n",
1204
+ "\n",
1205
+ "# viz_df[\"delta\"] = viz_df[\"bl_logit_bias\"].values\n",
1206
+ "viz_df[\"gamma\"] = viz_df[\"gamma\"].values\n",
1207
+ "# viz_df[\"gamma\"] = np.ones_like(viz_df[\"bl_proportion\"].values) - viz_df[\"bl_proportion\"].values\n",
1208
+ "\n",
1209
+ "# aggregate each field of interest for each hparam setting (group)\n",
1210
+ "describe_dict = grouped_df[\"w_bl_exp_whitelist_fraction\"].describe()\n",
1211
+ "viz_df[\"w_bl_exp_whitelist_fraction_mean\"] = describe_dict[\"mean\"].to_list()\n",
1212
+ "viz_df[\"w_bl_exp_whitelist_fraction_std\"] = describe_dict[\"std\"].to_list()\n",
1213
+ "\n",
1214
+ "describe_dict = grouped_df[\"w_bl_var_whitelist_fraction\"].describe()\n",
1215
+ "viz_df[\"w_bl_var_whitelist_fraction_mean\"] = describe_dict[\"mean\"].to_list()\n",
1216
+ "viz_df[\"w_bl_var_whitelist_fraction_std\"] = describe_dict[\"std\"].to_list()\n",
1217
+ "\n",
1218
+ "describe_dict = grouped_df[\"w_bl_whitelist_fraction\"].describe()\n",
1219
+ "viz_df[\"w_bl_whitelist_fraction_min\"] = describe_dict[\"min\"].to_list()\n",
1220
+ "viz_df[\"w_bl_whitelist_fraction_25\"] = describe_dict[\"25%\"].to_list()\n",
1221
+ "viz_df[\"w_bl_whitelist_fraction_50\"] = describe_dict[\"50%\"].to_list()\n",
1222
+ "viz_df[\"w_bl_whitelist_fraction_75\"] = describe_dict[\"75%\"].to_list()\n",
1223
+ "viz_df[\"w_bl_whitelist_fraction_max\"] = describe_dict[\"max\"].to_list()\n",
1224
+ "viz_df[\"w_bl_whitelist_fraction_mean\"] = describe_dict[\"mean\"].to_list()\n",
1225
+ "viz_df[\"w_bl_whitelist_fraction_std\"] = describe_dict[\"std\"].to_list()\n",
1226
+ "\n",
1227
+ "describe_dict = grouped_df[\"no_bl_whitelist_fraction\"].describe()\n",
1228
+ "viz_df[\"no_bl_whitelist_fraction_mean\"] = describe_dict[\"mean\"].to_list()\n",
1229
+ "viz_df[\"no_bl_whitelist_fraction_std\"] = describe_dict[\"std\"].to_list()\n",
1230
+ "\n",
1231
+ "\n",
1232
+ "describe_dict = grouped_df[\"w_bl_z_score\"].describe()\n",
1233
+ "viz_df[\"w_bl_z_score_mean\"] = describe_dict[\"mean\"].to_list()\n",
1234
+ "viz_df[\"w_bl_z_score_std\"] = describe_dict[\"std\"].to_list()\n",
1235
+ "\n",
1236
+ "describe_dict = grouped_df[\"no_bl_z_score\"].describe()\n",
1237
+ "viz_df[\"no_bl_z_score_mean\"] = describe_dict[\"mean\"].to_list()\n",
1238
+ "viz_df[\"no_bl_z_score_std\"] = describe_dict[\"std\"].to_list()\n",
1239
+ "\n",
1240
+ "describe_dict = grouped_df[\"baseline_z_score\"].describe()\n",
1241
+ "viz_df[\"baseline_z_score_mean\"] = describe_dict[\"mean\"].to_list()\n",
1242
+ "viz_df[\"baseline_z_score_std\"] = describe_dict[\"std\"].to_list()\n",
1243
+ "\n",
1244
+ "\n",
1245
+ "describe_dict = grouped_df[\"w_bl_ppl\"].describe()\n",
1246
+ "viz_df[\"w_bl_ppl_mean\"] = describe_dict[\"mean\"].to_list()\n",
1247
+ "viz_df[\"w_bl_ppl_std\"] = describe_dict[\"std\"].to_list()\n",
1248
+ "\n",
1249
+ "describe_dict = grouped_df[\"no_bl_ppl\"].describe()\n",
1250
+ "viz_df[\"no_bl_ppl_mean\"] = describe_dict[\"mean\"].to_list()\n",
1251
+ "viz_df[\"no_bl_ppl_std\"] = describe_dict[\"std\"].to_list()\n",
1252
+ "\n",
1253
+ "describe_dict = grouped_df[\"baseline_ppl\"].describe()\n",
1254
+ "viz_df[\"baseline_ppl_mean\"] = describe_dict[\"mean\"].to_list()\n",
1255
+ "viz_df[\"baseline_ppl_std\"] = describe_dict[\"std\"].to_list()\n",
1256
+ "\n",
1257
+ "describe_dict = grouped_df[\"avg_spike_entropy\"].describe()\n",
1258
+ "viz_df[\"avg_spike_entropy_mean\"] = describe_dict[\"mean\"].to_list()\n",
1259
+ "viz_df[\"avg_spike_entropy_std\"] = describe_dict[\"std\"].to_list()\n",
1260
+ "\n",
1261
+ "print(f\"groupby legend: {groupby_fields}\")\n"
1262
+ ]
1263
+ },
1264
+ {
1265
+ "cell_type": "code",
1266
+ "execution_count": null,
1267
+ "metadata": {},
1268
+ "outputs": [],
1269
+ "source": [
1270
+ "# filtering\n",
1271
+ "\n",
1272
+ "viz_df = viz_df[viz_df[\"bl_hparams\"].apply(lambda tup: (tup[0] == True))] # sampling\n",
1273
+ "\n",
1274
+ "# viz_df = viz_df[viz_df[\"bl_hparams\"].apply(lambda tup: (tup[0] == False))] # greedy\n",
1275
+ "\n",
1276
+ "\n",
1277
+ "# fix one of the bl params for analytic chart\n",
1278
+ "# viz_df = viz_df[(viz_df[\"gamma\"]==0.9) & (viz_df[\"delta\"]<=10.0)]\n",
1279
+ "# viz_df = viz_df[(viz_df[\"gamma\"]==0.75) & (viz_df[\"delta\"]<=10.0)]\n",
1280
+ "# viz_df = viz_df[(viz_df[\"gamma\"]==0.5) & (viz_df[\"delta\"]<=10.0)]\n",
1281
+ "# viz_df = viz_df[(viz_df[\"gamma\"]==0.25) & (viz_df[\"delta\"]<=10.0)]\n",
1282
+ "# viz_df = viz_df[(viz_df[\"gamma\"]==0.1) & (viz_df[\"delta\"]<=10.0)]\n",
1283
+ "\n",
1284
+ "# for the sample pareto chart\n",
1285
+ "viz_df = viz_df[(viz_df[\"delta\"] > 0.5) & (viz_df[\"delta\"]<=10.0)]\n",
1286
+ "# viz_df = viz_df[(viz_df[\"delta\"]<=2.0)] # zoom in on lower deltas\n",
1287
+ "# viz_df = viz_df[(viz_df[\"delta\"] >= 2.0) & (viz_df[\"delta\"]<=10.0)] # mid deltas\n",
1288
+ "# viz_df = viz_df[(viz_df[\"gamma\"] != 0.25) & (viz_df[\"gamma\"] != 0.75) & (viz_df[\"delta\"]<=2.0)]\n",
1289
+ "# viz_df = viz_df[(viz_df[\"gamma\"] != 0.1) & (viz_df[\"gamma\"] != 0.9) & (viz_df[\"delta\"]<=2.0)]\n",
1290
+ "\n",
1291
+ "# viz_df = viz_df[(viz_df[\"delta\"]==0.5) | (viz_df[\"delta\"]==2.0) | (viz_df[\"delta\"]==10.0)]\n",
1292
+ "\n",
1293
+ "# viz_df = viz_df[(viz_df[\"delta\"]!=0.1)&(viz_df[\"delta\"]!=0.5)&(viz_df[\"delta\"]!=50.0)]\n",
1294
+ "\n",
1295
+ "# for the beams pareto\n",
1296
+ "# viz_df = viz_df[(viz_df[\"delta\"]!=50.0)]\n",
1297
+ "# viz_df = viz_df[(viz_df[\"delta\"]!=50.0) & (viz_df[\"num_beams\"]!=1)]\n",
1298
+ "\n",
1299
+ "print(len(viz_df))\n",
1300
+ "\n",
1301
+ "viz_df"
1302
+ ]
1303
+ },
1304
+ {
1305
+ "cell_type": "code",
1306
+ "execution_count": null,
1307
+ "metadata": {},
1308
+ "outputs": [],
1309
+ "source": [
1310
+ "# grouped_df[\"avg_spike_entropy\"]"
1311
+ ]
1312
+ },
1313
+ {
1314
+ "cell_type": "code",
1315
+ "execution_count": null,
1316
+ "metadata": {},
1317
+ "outputs": [],
1318
+ "source": [
1319
+ "# viz_df[[\"gamma\",\"avg_spike_entropy_mean\"]]"
1320
+ ]
1321
+ },
1322
+ {
1323
+ "attachments": {},
1324
+ "cell_type": "markdown",
1325
+ "metadata": {},
1326
+ "source": [
1327
+ "# Basic Exp vs Empirical WL fraction chart (figure 7)"
1328
+ ]
1329
+ },
1330
+ {
1331
+ "cell_type": "code",
1332
+ "execution_count": null,
1333
+ "metadata": {},
1334
+ "outputs": [],
1335
+ "source": [
1336
+ "\n",
1337
+ "# plt.style.use(\"classic\")\n",
1338
+ "plt.style.use(\"default\")\n",
1339
+ "# plt.style.use('ggplot') \n",
1340
+ "# plt.style.use('seaborn')\n",
1341
+ "\n",
1342
+ "rc('font', **{'family': 'serif', 'serif': ['Computer Modern']})\n",
1343
+ "rc('text', usetex=True)\n",
1344
+ "\n",
1345
+ "\n",
1346
+ "plt.clf()\n",
1347
+ "# plt.figure(figsize=(16, 4))\n",
1348
+ "# plt.figure(figsize=(8, 4))\n",
1349
+ "plt.figure(constrained_layout=True)\n",
1350
+ "plt.figure(figsize=(5, 4))\n",
1351
+ "\n",
1352
+ "\n",
1353
+ "# x_col = 'bl_hparams'\n",
1354
+ "# a = viz_df[x_col].apply(str)\n",
1355
+ "\n",
1356
+ "# x_col = 'bl_logit_bias'\n",
1357
+ "# x_col = 'bl_proportion'\n",
1358
+ "x_col = \"delta\"\n",
1359
+ "# x_col = \"gamma\"\n",
1360
+ "\n",
1361
+ "a = viz_df[x_col]\n",
1362
+ "print(f\"Num configurations: {len(a)}\")\n",
1363
+ "\n",
1364
+ "y_col = 'w_bl_whitelist_fraction_mean'\n",
1365
+ "y_col_err = 'w_bl_whitelist_fraction_std'\n",
1366
+ "\n",
1367
+ "viridis = plt.colormaps['viridis'].resampled(4)\n",
1368
+ "# cmap = viridis.colors[::-1]\n",
1369
+ "cmap = viridis.colors\n",
1370
+ "\n",
1371
+ "plt.plot(a, viz_df[\"w_bl_whitelist_fraction_mean\"].values, color=cmap[1], marker='o', label='Mean') \n",
1372
+ "plt.plot(a, viz_df[\"w_bl_whitelist_fraction_25\"].values, color=cmap[1], linestyle='-.', label='25th Percentile') \n",
1373
+ "plt.plot(a, viz_df[\"w_bl_whitelist_fraction_75\"].values, color=cmap[1], linestyle='-.', label='75th Percentile') \n",
1374
+ "# plt.plot(a, viz_df[\"w_bl_whitelist_fraction_min\"].values, color=cmap[1], linestyle='-.', label='min') \n",
1375
+ "# plt.plot(a, viz_df[\"w_bl_whitelist_fraction_max\"].values, color=cmap[1], linestyle='-.', label='max') \n",
1376
+ "\n",
1377
+ "#fill between the upper and lower bands\n",
1378
+ "plt.fill_between(a, viz_df[\"w_bl_whitelist_fraction_25\"], viz_df[\"w_bl_whitelist_fraction_75\"], alpha = .1,color = cmap[1])\n",
1379
+ "# plt.fill_between(a, viz_df[\"w_bl_whitelist_fraction_25\"], viz_df[\"w_bl_whitelist_fraction_75\"], alpha = .1,color = 'darkorchid')\n",
1380
+ "# plt.fill_between(a, y1_low, y1_high, alpha = .1,color = 'goldenrod')\n",
1381
+ "\n",
1382
+ "\n",
1383
+ "y_col = 'w_bl_exp_whitelist_fraction_mean'\n",
1384
+ "# y_col_err = 'w_bl_var_whitelist_fraction_mean'\n",
1385
+ "# d = viz_df[x_col].apply(str)\n",
1386
+ "\n",
1387
+ "# sub_df = viz_df[viz_df[\"num_beams\"]==1]\n",
1388
+ "\n",
1389
+ "a = viz_df[x_col]\n",
1390
+ "e = viz_df[y_col].values\n",
1391
+ "# plt.plot(a, e, label=\"Predicted Lower Bound\", color=cmap[-1])\n",
1392
+ "plt.plot(a, e, label=\"Analytic Bound\", color=\"r\")\n",
1393
+ "# f = viz_df[y_col_err].values\n",
1394
+ "# # f = np.sqrt(viz_df[y_col_err].values)\n",
1395
+ "# plt.errorbar(d, e, yerr=f, fmt=\"o\")\n",
1396
+ "\n",
1397
+ "plt.legend(loc=\"lower right\",frameon=True, facecolor=\"white\")\n",
1398
+ "\n",
1399
+ "# for logit bias x axis\n",
1400
+ "# log_axis = True\n",
1401
+ "log_axis = False\n",
1402
+ "if log_axis:\n",
1403
+ " plt.xscale(\"log\")\n",
1404
+ "\n",
1405
+ "ax = plt.gca()\n",
1406
+ "plt.draw()\n",
1407
+ "\n",
1408
+ "\n",
1409
+ "\n",
1410
+ "plt.xlabel(f\"Green List Bias, $\\delta$\")\n",
1411
+ "# plt.xlabel(f\"Whitelist size := $\\gamma$\")\n",
1412
+ "\n",
1413
+ "plt.ylabel(\"Fraction in Green List\")\n",
1414
+ "\n",
1415
+ "\n",
1416
+ "plt.grid()\n",
1417
+ "\n",
1418
+ "plt.tight_layout()\n",
1419
+ "\n",
1420
+ "if log_axis:\n",
1421
+ " plot_name = \"analytic_w_sampling_log.pdf\"\n",
1422
+ "else:\n",
1423
+ " plot_name = \"analytic_w_sampling_linear.pdf\"\n",
1424
+ " # plot_name = f\"analytic_w_sampling_linear_gamma_{viz_df['gamma'].values[0]}.pdf\"\n",
1425
+ "\n",
1426
+ "# plot_name = \"analytic_w_sampling_linear_greenlist.pdf\"\n",
1427
+ "print(plot_name)\n",
1428
+ "\n",
1429
+ "# fname = f\"figs/{plot_name}\"\n",
1430
+ "# plt.savefig(fname, format=\"pdf\")\n",
1431
+ "plt.show()\n"
1432
+ ]
1433
+ },
1434
+ {
1435
+ "attachments": {},
1436
+ "cell_type": "markdown",
1437
+ "metadata": {},
1438
+ "source": [
1439
+ "# delta gamma sampling pareto plot (figure 2 left)"
1440
+ ]
1441
+ },
1442
+ {
1443
+ "cell_type": "code",
1444
+ "execution_count": null,
1445
+ "metadata": {},
1446
+ "outputs": [],
1447
+ "source": [
1448
+ "rc('font', **{'family': 'serif', 'serif': ['Computer Modern']})\n",
1449
+ "rc('text', usetex=True)\n",
1450
+ "\n",
1451
+ "plt.clf()\n",
1452
+ "plt.figure(constrained_layout=True)\n",
1453
+ "plt.figure(figsize=(5, 4))\n",
1454
+ "\n",
1455
+ "\n",
1456
+ "x_col = 'w_bl_ppl_mean'\n",
1457
+ "y_col = 'w_bl_z_score_mean'\n",
1458
+ "\n",
1459
+ "# markers = [\"x\", \"p\", \"*\", \"P\"]\n",
1460
+ "\n",
1461
+ "deltas = sorted(np.unique(viz_df[\"delta\"].values))\n",
1462
+ "gammas = sorted(np.unique(viz_df[\"gamma\"].values), reverse=True)\n",
1463
+ "print(deltas, gammas)\n",
1464
+ "gamma_labels = [(g if g > 0.1 else 0.1) for g in gammas]\n",
1465
+ "\n",
1466
+ "markers = [\"x\", \"p\", \"*\", \"P\"][:len(deltas)]\n",
1467
+ "\n",
1468
+ "num_colors = len(gammas)\n",
1469
+ "cmap = cmr.get_sub_cmap('viridis', 0.0, 0.66, N=num_colors)\n",
1470
+ "# cmap = cmr.get_sub_cmap('plasma', 0.0, 0.66, N=num_colors)\n",
1471
+ "colors = cmap.colors#[::-1]\n",
1472
+ "\n",
1473
+ "\n",
1474
+ "for i,delta in enumerate(deltas):\n",
1475
+ " for j,gamma in enumerate(gammas):\n",
1476
+ " sub_df = viz_df[(viz_df[\"delta\"] == delta) & (viz_df[\"gamma\"] == gamma)]\n",
1477
+ " a = sub_df[x_col].values\n",
1478
+ " b = sub_df[y_col].values\n",
1479
+ " # plt.scatter(a, b, label=f\"$\\delta={delta},\\gamma={gamma}$\", color=colors[j], marker=markers[i])\n",
1480
+ " plt.plot(a, b, label=f\"$\\delta={delta},\\gamma={gamma}$\", color=colors[j], marker=markers[i])\n",
1481
+ "\n",
1482
+ "\n",
1483
+ "x_col = 'no_bl_ppl_mean'\n",
1484
+ "y_col = 'no_bl_z_score_mean'\n",
1485
+ "# x_col = 'baseline_ppl_mean'\n",
1486
+ "# y_col = 'baseline_z_score_mean'\n",
1487
+ "\n",
1488
+ "\n",
1489
+ "for i,delta in enumerate(deltas):\n",
1490
+ " for j,gamma in enumerate(gammas):\n",
1491
+ " sub_df = viz_df[(viz_df[\"delta\"] == delta) & (viz_df[\"gamma\"] == gamma)]\n",
1492
+ " a = sub_df[x_col].values\n",
1493
+ " b = sub_df[y_col].values\n",
1494
+ " plt.scatter(a, b, label=f\"$\\delta={delta},\\gamma={gamma}$\", color=colors[j])\n",
1495
+ "\n",
1496
+ "# # # for manual legend\n",
1497
+ "plt.scatter([-1],[-1], label=\"Vanilla\", color=\"gray\", marker=\"o\")\n",
1498
+ "\n",
1499
+ "ax = plt.gca()\n",
1500
+ "\n",
1501
+ "from matplotlib.cm import ScalarMappable\n",
1502
+ "from matplotlib.colors import Normalize, NoNorm, ListedColormap\n",
1503
+ "cmap = ListedColormap(colors)\n",
1504
+ "cmappable = ScalarMappable(norm=NoNorm(),cmap=cmap)\n",
1505
+ "cbar = plt.colorbar(cmappable,ticks=[i for i in range(len(gammas))],shrink=0.6, pad = 0.03)\n",
1506
+ "cbar.ax.set_yticklabels(gamma_labels) \n",
1507
+ "cbar.set_label('$\\gamma$', rotation=0)\n",
1508
+ "\n",
1509
+ "\n",
1510
+ "all_x = np.concatenate([viz_df['w_bl_ppl_mean'].values,viz_df['no_bl_ppl_mean'].values])\n",
1511
+ "all_y = np.concatenate([viz_df['w_bl_z_score_mean'].values,viz_df['no_bl_z_score_mean'].values])\n",
1512
+ "# all_x = np.concatenate([viz_df['w_bl_ppl_mean'].values,viz_df['baseline_ppl_mean'].values])\n",
1513
+ "# all_y = np.concatenate([viz_df['w_bl_z_score_mean'].values,viz_df['baseline_z_score_mean'].values])\n",
1514
+ "\n",
1515
+ "min_x, max_x = np.min(all_x), np.max(all_x)\n",
1516
+ "min_y, max_y = np.min(all_y), np.max(all_y)\n",
1517
+ "\n",
1518
+ "# x_min_tick = 1.0\n",
1519
+ "x_min_tick = 3.0\n",
1520
+ "x_max_tick = np.ceil([max_x])[0]+1.0\n",
1521
+ "y_min_tick = 0.0\n",
1522
+ "y_max_tick = np.ceil([max_y])[0]+1.0\n",
1523
+ "\n",
1524
+ "x_ticks = np.arange(x_min_tick,x_max_tick,1.0)\n",
1525
+ "y_ticks = np.arange(y_min_tick,y_max_tick,5.0)\n",
1526
+ "\n",
1527
+ "\n",
1528
+ "x_lim_min = 3.0\n",
1529
+ "x_lim_max = x_max_tick\n",
1530
+ "y_lim_min = 0.45\n",
1531
+ "# y_lim_max = 1.09\n",
1532
+ "y_lim_max = 1.005\n",
1533
+ "\n",
1534
+ "\n",
1535
+ "# plt.xlim((x_min_tick-0.5,x_max_tick))\n",
1536
+ "plt.xlim((x_lim_min,x_lim_max))\n",
1537
+ "# plt.xlim((4.0,8.0))\n",
1538
+ "# plt.ylim((-1.0,20.0))\n",
1539
+ "# plt.ylim((y_lim_min,y_lim_max))\n",
1540
+ "\n",
1541
+ "ax.set_xticks(x_ticks)\n",
1542
+ "# ax.set_yticks(y_ticks)\n",
1543
+ "\n",
1544
+ "ax.invert_xaxis()\n",
1545
+ "\n",
1546
+ "# # manual legend for dual parameter visualization\n",
1547
+ "f = lambda m,c: plt.plot([],[],marker=m, color=c, ls=\"none\")[0]\n",
1548
+ "handles = [f(markers[::-1][i], \"gray\") for i in range(len(deltas))]\n",
1549
+ "handles += [f(\"o\", \"gray\")]\n",
1550
+ "labels = [f\"$\\delta={delta}$\" for delta in deltas[::-1]]+[f\"$\\delta=0.0$\"]\n",
1551
+ "plt.legend(handles, labels, loc=\"upper right\", framealpha=1)\n",
1552
+ "\n",
1553
+ "plt.grid()\n",
1554
+ "\n",
1555
+ "plt.xlabel(\"Oracle Model PPL (better →)\")\n",
1556
+ "plt.ylabel(\"z-score (better →)\")\n",
1557
+ "\n",
1558
+ "\n",
1559
+ "plt.tight_layout()\n",
1560
+ "\n",
1561
+ "# plot_name = \"pareto_sampling_no_beams\"\n",
1562
+ "# fname = f\"figs/{plot_name}.pdf\"\n",
1563
+ "# plt.savefig(fname, format=\"pdf\")\n",
1564
+ "plt.show()"
1565
+ ]
1566
+ },
1567
+ {
1568
+ "attachments": {},
1569
+ "cell_type": "markdown",
1570
+ "metadata": {},
1571
+ "source": [
1572
+ "# beams pareto plot (figure 2 right)"
1573
+ ]
1574
+ },
1575
+ {
1576
+ "cell_type": "code",
1577
+ "execution_count": null,
1578
+ "metadata": {},
1579
+ "outputs": [],
1580
+ "source": [
1581
+ "num_colors = 3\n",
1582
+ "cmap = cmr.get_sub_cmap('viridis', 0.0, 0.66, N=num_colors)\n",
1583
+ "colors = cmap.colors#[::-1]\n",
1584
+ "\n",
1585
+ "# plt.style.use('ggplot')\n",
1586
+ "# plt.style.use('seaborn')\n",
1587
+ "\n",
1588
+ "rc('font', **{'family': 'serif', 'serif': ['Computer Modern']})\n",
1589
+ "rc('text', usetex=True)\n",
1590
+ "\n",
1591
+ "plt.clf()\n",
1592
+ "plt.figure(constrained_layout=True)\n",
1593
+ "plt.figure(figsize=(5, 4))\n",
1594
+ "\n",
1595
+ "\n",
1596
+ "x_col = 'w_bl_ppl_mean'\n",
1597
+ "y_col = 'w_bl_z_score_mean'\n",
1598
+ "\n",
1599
+ "markers = [\"s\",\"D\", \"x\", \"p\", \"*\", \"P\"] # <--- seems to match other pareto fig ordering\n",
1600
+ "\n",
1601
+ "deltas = sorted(np.unique(viz_df[\"delta\"].values))\n",
1602
+ "num_beams = sorted(np.unique(viz_df[\"num_beams\"].values))\n",
1603
+ "# gamma_labels = [(g if g > 0.1 else 0.1) for g in np.unique(viz_df[\"gamma\"].values)]\n",
1604
+ "\n",
1605
+ "for i,n_beams in enumerate(num_beams):\n",
1606
+ " for j,delta in enumerate(deltas):\n",
1607
+ " sub_df = viz_df[(viz_df[\"delta\"] == delta) & (viz_df[\"num_beams\"] == n_beams)]\n",
1608
+ " a = sub_df[x_col].values\n",
1609
+ " b = sub_df[y_col].values\n",
1610
+ " # plt.scatter(a, b, label=f\"$\\delta={delta},\\gamma={gamma}$\", color=colors[j], marker=markers[i])\n",
1611
+ " plt.plot(a, b, label=f\"$\\delta={delta}$\", color=colors[i], marker=markers[j])\n",
1612
+ "\n",
1613
+ "\n",
1614
+ "x_col = 'no_bl_ppl_mean'\n",
1615
+ "y_col = 'no_bl_z_score_mean'\n",
1616
+ "\n",
1617
+ "\n",
1618
+ "\n",
1619
+ "for i,n_beams in enumerate(num_beams):\n",
1620
+ " for j,delta in enumerate(deltas):\n",
1621
+ " sub_df = viz_df[(viz_df[\"delta\"] == delta) & (viz_df[\"num_beams\"] == n_beams)]\n",
1622
+ " a = sub_df[x_col].values\n",
1623
+ " b = sub_df[y_col].values\n",
1624
+ " plt.scatter(a, b, label=f\"$\\delta={delta}$\", color=colors[i])\n",
1625
+ "\n",
1626
+ "# # # for manual legend\n",
1627
+ "plt.scatter([-10],[-10], label=\"$\\delta=0$\", color=\"gray\", marker=\"o\")\n",
1628
+ "\n",
1629
+ "ax = plt.gca()\n",
1630
+ "\n",
1631
+ "from matplotlib.cm import ScalarMappable\n",
1632
+ "from matplotlib.colors import Normalize, NoNorm, ListedColormap\n",
1633
+ "cmap = ListedColormap(colors)\n",
1634
+ "cmappable = ScalarMappable(norm=NoNorm(),cmap=cmap)\n",
1635
+ "cbar = plt.colorbar(cmappable,ticks=[i for i in range(len(num_beams))],shrink=0.6, pad = 0.04)\n",
1636
+ "# cbar.set_ticks(num_beams)\n",
1637
+ "cbar.set_ticklabels(num_beams)\n",
1638
+ "# cbar.ax.set_yticklabels(num_beams) \n",
1639
+ "cbar.set_label('Num Beams', rotation=90)\n",
1640
+ "\n",
1641
+ "\n",
1642
+ "all_x = np.concatenate([viz_df['w_bl_ppl_mean'].values,viz_df['no_bl_ppl_mean'].values])\n",
1643
+ "all_y = np.concatenate([viz_df['w_bl_z_score_mean'].values,viz_df['no_bl_z_score_mean'].values])\n",
1644
+ "\n",
1645
+ "min_x, max_x = np.min(all_x), np.max(all_x)\n",
1646
+ "min_y, max_y = np.min(all_y), np.max(all_y)\n",
1647
+ "\n",
1648
+ "# x_max_tick = np.ceil([max_x])[0]+1.0\n",
1649
+ "x_max_tick = np.ceil([max_x])[0]\n",
1650
+ "y_max_tick = np.ceil([max_y])[0]+1.0\n",
1651
+ "\n",
1652
+ "\n",
1653
+ "plt.xlim((1.0,x_max_tick))\n",
1654
+ "plt.ylim((-1.0,y_max_tick))\n",
1655
+ "\n",
1656
+ "# x_ticks = np.arange(x_min_tick,x_max_tick,1.0)\n",
1657
+ "# y_ticks = np.arange(y_min_tick,y_max_tick,5.0)\n",
1658
+ "\n",
1659
+ "# ax.set_xticks(x_ticks)\n",
1660
+ "# ax.set_yticks(y_ticks)\n",
1661
+ "\n",
1662
+ "ax.invert_xaxis()\n",
1663
+ "\n",
1664
+ "# # manual legend for dual parameter visualization\n",
1665
+ "f = lambda m,c: plt.plot([],[],marker=m, color=c, ls=\"none\")[0]\n",
1666
+ "handles = [f(markers[::-1][i], \"gray\") for i in range(len(deltas))]\n",
1667
+ "handles += [f(\"o\", \"gray\")]\n",
1668
+ "labels = [f\"$\\delta={delta}$\" for delta in deltas[::-1]]+[f\"$\\delta=0.0$\"]\n",
1669
+ "plt.legend(handles, labels, loc=\"lower left\", framealpha=1)\n",
1670
+ "\n",
1671
+ "plt.grid()\n",
1672
+ "\n",
1673
+ "plt.xlabel(\"Oracle Model PPL (better →)\")\n",
1674
+ "plt.ylabel(\"z-score (better →)\")\n",
1675
+ "\n",
1676
+ "\n",
1677
+ "plt.tight_layout()\n",
1678
+ "\n",
1679
+ "\n",
1680
+ "plot_name = \"pareto_greedy_w_beams\"\n",
1681
+ "print(plot_name)\n",
1682
+ "\n",
1683
+ "# fname = f\"figs/{plot_name}.pdf\"\n",
1684
+ "# plt.savefig(fname, format=\"pdf\")\n",
1685
+ "plt.show()"
1686
+ ]
1687
+ },
1688
+ {
1689
+ "cell_type": "code",
1690
+ "execution_count": null,
1691
+ "metadata": {},
1692
+ "outputs": [],
1693
+ "source": []
1694
+ },
1695
+ {
1696
+ "attachments": {},
1697
+ "cell_type": "markdown",
1698
+ "metadata": {},
1699
+ "source": [
1700
+ "## z vs entropy (not in paper)"
1701
+ ]
1702
+ },
1703
+ {
1704
+ "cell_type": "code",
1705
+ "execution_count": null,
1706
+ "metadata": {},
1707
+ "outputs": [],
1708
+ "source": [
1709
+ "print(f\"groupby legend: {groupby_fields}\")\n",
1710
+ "# hist_subset = grouped_df.get_group((True,1,2.0,0.1)) # needs to match the groupby keys and order\n",
1711
+ "# hist_subset = grouped_df.get_group((True,1,2.0,0.25)) \n",
1712
+ "hist_subset = grouped_df.get_group((True,1,2.0,0.5)) \n",
1713
+ "# hist_subset = grouped_df.get_group((True,1,2.0,0.75)) \n",
1714
+ "# hist_subset = grouped_df.get_group((True,1,2.0,0.9)) "
1715
+ ]
1716
+ },
1717
+ {
1718
+ "cell_type": "code",
1719
+ "execution_count": null,
1720
+ "metadata": {},
1721
+ "outputs": [],
1722
+ "source": [
1723
+ "print(len(hist_subset))\n",
1724
+ "# hist_subset = hist_subset[hist_subset[\"w_bl_space_frac\"] <= 0.9]\n",
1725
+ "# hist_subset = hist_subset[hist_subset[\"no_bl_space_frac\"] <= 0.9]\n",
1726
+ "# print(len(hist_subset))"
1727
+ ]
1728
+ },
1729
+ {
1730
+ "cell_type": "code",
1731
+ "execution_count": null,
1732
+ "metadata": {},
1733
+ "outputs": [],
1734
+ "source": [
1735
+ "# y = hist_subset[\"w_bl_z_score\"]\n",
1736
+ "# y = hist_subset[\"no_bl_z_score\"]\n",
1737
+ "y = hist_subset[\"baseline_z_score\"]\n",
1738
+ "\n",
1739
+ "x = hist_subset[\"avg_spike_entropy\"]\n",
1740
+ "\n",
1741
+ "plt.clf()\n",
1742
+ "\n",
1743
+ "\n",
1744
+ "plt.scatter(x, y)\n",
1745
+ "\n",
1746
+ "\n",
1747
+ "plt.grid()\n",
1748
+ "\n",
1749
+ "plt.xlabel(\"Entropy\")\n",
1750
+ "plt.ylabel(\"z-score\")\n",
1751
+ "\n",
1752
+ "plt.show()"
1753
+ ]
1754
+ },
1755
+ {
1756
+ "cell_type": "code",
1757
+ "execution_count": null,
1758
+ "metadata": {},
1759
+ "outputs": [],
1760
+ "source": [
1761
+ "cols_to_tabulate = [\n",
1762
+ " 'idx', \n",
1763
+ " 'truncated_input', \n",
1764
+ " 'baseline_completion',\n",
1765
+ " 'no_bl_output', \n",
1766
+ " 'w_bl_output', \n",
1767
+ " 'avg_spike_entropy',\n",
1768
+ " 'no_bl_z_score',\n",
1769
+ " 'w_bl_z_score',\n",
1770
+ " 'w_bl_whitelist_fraction',\n",
1771
+ " 'no_bl_whitelist_fraction',\n",
1772
+ " 'baseline_ppl',\n",
1773
+ " 'no_bl_ppl',\n",
1774
+ " 'w_bl_ppl'\n",
1775
+ "]\n",
1776
+ "\n",
1777
+ "slice_size = 10\n",
1778
+ "\n",
1779
+ "num_examples = len(hist_subset)\n",
1780
+ "midpt = num_examples//5\n",
1781
+ "lower = midpt - (slice_size//2)\n",
1782
+ "upper = midpt + (slice_size//2)+1\n",
1783
+ "\n",
1784
+ "high_entropy_examples = hist_subset[cols_to_tabulate].sort_values([\"avg_spike_entropy\"],ascending=True).tail(slice_size)\n",
1785
+ "mid_entropy_examples = hist_subset[cols_to_tabulate].sort_values([\"avg_spike_entropy\"],ascending=True).iloc[lower:upper]\n",
1786
+ "low_entropy_examples = hist_subset[cols_to_tabulate].sort_values([\"avg_spike_entropy\"],ascending=True).head(slice_size)"
1787
+ ]
1788
+ },
1789
+ {
1790
+ "cell_type": "code",
1791
+ "execution_count": null,
1792
+ "metadata": {},
1793
+ "outputs": [],
1794
+ "source": [
1795
+ "# hist_subset[cols_to_tabulate][(hist_subset[\"avg_spike_entropy\"]<0.7)&(hist_subset[\"w_bl_z_score\"]>=14.0)]\n",
1796
+ "hist_subset[cols_to_tabulate][(hist_subset[\"avg_spike_entropy\"]<0.7)&(hist_subset[\"baseline_z_score\"]>=7.0)]\n",
1797
+ "# hist_subset[cols_to_tabulate][(hist_subset[\"avg_spike_entropy\"]<0.7)&(hist_subset[\"w_bl_z_score\"]>=12.0)]\n",
1798
+ "# print(hist_subset[cols_to_tabulate][(hist_subset[\"avg_spike_entropy\"]<0.7)&(hist_subset[\"w_bl_z_score\"]>=14.0)].iloc[6][\"w_bl_output\"])\n",
1799
+ "# .to_csv(\"input/pile_low_S_high_z_outliers.csv\")"
1800
+ ]
1801
+ },
1802
+ {
1803
+ "cell_type": "code",
1804
+ "execution_count": null,
1805
+ "metadata": {},
1806
+ "outputs": [],
1807
+ "source": [
1808
+ "high_entropy_examples"
1809
+ ]
1810
+ },
1811
+ {
1812
+ "cell_type": "code",
1813
+ "execution_count": null,
1814
+ "metadata": {},
1815
+ "outputs": [],
1816
+ "source": [
1817
+ "mid_entropy_examples"
1818
+ ]
1819
+ },
1820
+ {
1821
+ "cell_type": "code",
1822
+ "execution_count": null,
1823
+ "metadata": {},
1824
+ "outputs": [],
1825
+ "source": [
1826
+ "low_entropy_examples"
1827
+ ]
1828
+ },
1829
+ {
1830
+ "cell_type": "code",
1831
+ "execution_count": null,
1832
+ "metadata": {},
1833
+ "outputs": [],
1834
+ "source": []
1835
+ },
1836
+ {
1837
+ "attachments": {},
1838
+ "cell_type": "markdown",
1839
+ "metadata": {},
1840
+ "source": [
1841
+ "# plotting histograms of the metric for single runs (not in paper)"
1842
+ ]
1843
+ },
1844
+ {
1845
+ "cell_type": "code",
1846
+ "execution_count": null,
1847
+ "metadata": {},
1848
+ "outputs": [],
1849
+ "source": [
1850
+ "print(f\"groupby legend: {groupby_fields}\")\n",
1851
+ "# hist_subset = grouped_df.get_group((True,1,2.0,0.1)) # needs to match the groupby keys and order\n",
1852
+ "# hist_subset = grouped_df.get_group((True,1,2.0,0.25)) \n",
1853
+ "hist_subset = grouped_df.get_group((True,1,2.0,0.5)) \n",
1854
+ "# hist_subset = grouped_df.get_group((True,1,2.0,0.75)) \n",
1855
+ "# hist_subset = grouped_df.get_group((True,1,2.0,0.9)) "
1856
+ ]
1857
+ },
1858
+ {
1859
+ "cell_type": "markdown",
1860
+ "metadata": {},
1861
+ "source": [
1862
+ "##### old filters to smooth the histograms"
1863
+ ]
1864
+ },
1865
+ {
1866
+ "cell_type": "code",
1867
+ "execution_count": null,
1868
+ "metadata": {},
1869
+ "outputs": [],
1870
+ "source": [
1871
+ "# hist_subset = hist_subset[(hist_subset[\"no_bl_num_tokens_generated\"] == hist_subset[\"max_new_tokens\"]) & (hist_subset[\"w_bl_num_tokens_generated\"] == hist_subset[\"max_new_tokens\"])]\n",
1872
+ "# hist_subset = hist_subset[hist_subset[\"truncated_input\"] != \"\"]"
1873
+ ]
1874
+ },
1875
+ {
1876
+ "cell_type": "code",
1877
+ "execution_count": null,
1878
+ "metadata": {},
1879
+ "outputs": [],
1880
+ "source": [
1881
+ "all_no_bl_wl_fractions = hist_subset[\"no_bl_whitelist_fraction\"]\n",
1882
+ "all_w_bl_wl_fractions = hist_subset[\"w_bl_whitelist_fraction\"]\n",
1883
+ "all_baseline_wl_fractions = hist_subset[\"baseline_whitelist_fraction\"]\n",
1884
+ "# all_no_bl_wl_fractions = hist_subset[\"no_bl_z_score\"]\n",
1885
+ "# all_w_bl_wl_fractions = hist_subset[\"w_bl_z_score\"]\n",
1886
+ "# all_baseline_wl_fractions = hist_subset[\"baseline_z_score\"]\n",
1887
+ "\n",
1888
+ "plt.clf()\n",
1889
+ "\n",
1890
+ "all_vals = np.concatenate([all_baseline_wl_fractions, all_w_bl_wl_fractions, all_no_bl_wl_fractions])\n",
1891
+ "n_bins = 50\n",
1892
+ "bins = np.linspace(np.min(all_vals), np.max(all_vals), n_bins)\n",
1893
+ "# bins = np.linspace(0.0, 1.0, n_bins)\n",
1894
+ "\n",
1895
+ "# plt.hist(all_no_bl_wl_fractions, \n",
1896
+ "# bins=bins,\n",
1897
+ "# alpha=0.6,\n",
1898
+ "# label='no blacklisting')\n",
1899
+ "\n",
1900
+ "\n",
1901
+ "plt.hist(all_w_bl_wl_fractions, \n",
1902
+ " bins=bins,\n",
1903
+ " alpha=0.6,\n",
1904
+ " label='with blacklisting')\n",
1905
+ "\n",
1906
+ "plt.hist(all_baseline_wl_fractions,\n",
1907
+ " bins=bins,\n",
1908
+ " alpha=0.4,\n",
1909
+ " # label='wl')\n",
1910
+ " label='ground truth/real text')\n",
1911
+ "\n",
1912
+ "# plt.hist(all_baseline_bl_fractions, \n",
1913
+ "# bins=bins,\n",
1914
+ "# alpha=0.5,\n",
1915
+ "# label='bl')\n",
1916
+ "\n",
1917
+ "plt.legend(loc='upper right')\n",
1918
+ "\n",
1919
+ "# plt.xlim((-0.1,1.1))\n",
1920
+ "# plt.xticks(np.arange(0.0,1.0,0.1))\n",
1921
+ "plt.xlabel(\"fraction of total toks gen'd in WL\")\n",
1922
+ "plt.ylabel(\"freq\")\n",
1923
+ "\n",
1924
+ "# plt.title('baseline wl/bl fractions')\n",
1925
+ "plt.title(\"Output Whitelist Token Distribution\")\n",
1926
+ "\n",
1927
+ "# plot_name = \"wl_distro\"\n",
1928
+ "# fname = f\"figs/{plot_name}.png\"\n",
1929
+ "# plt.savefig(fname, dpi=600)"
1930
+ ]
1931
+ },
1932
+ {
1933
+ "cell_type": "code",
1934
+ "execution_count": null,
1935
+ "metadata": {},
1936
+ "outputs": [],
1937
+ "source": [
1938
+ "plt.clf()\n",
1939
+ "\n",
1940
+ "all_no_bl_ppls = hist_subset[\"no_bl_ppl\"]\n",
1941
+ "all_w_bl_ppls = hist_subset[\"w_bl_ppl\"]\n",
1942
+ "all_baseline_ppls = hist_subset[\"baseline_ppl\"]\n",
1943
+ "\n",
1944
+ "all_vals = list(np.concatenate([all_no_bl_ppls, all_w_bl_ppls]))\n",
1945
+ "all_vals = sorted(all_vals)\n",
1946
+ "n_bins = 50\n",
1947
+ "# bins = np.linspace(all_vals[0], all_vals[-1], n_bins)\n",
1948
+ "bins = np.linspace(all_vals[0], 20, n_bins)\n",
1949
+ "\n",
1950
+ "plt.hist(all_no_bl_ppls, \n",
1951
+ " bins=bins,\n",
1952
+ " alpha=0.6,\n",
1953
+ " label='no blacklisting')\n",
1954
+ "\n",
1955
+ "plt.hist(all_w_bl_ppls, \n",
1956
+ " bins=bins,\n",
1957
+ " alpha=0.6,\n",
1958
+ " label='with blacklisting')\n",
1959
+ "\n",
1960
+ "plt.legend(loc='upper right')\n",
1961
+ "\n",
1962
+ "# plt.xlim((0,1))\n",
1963
+ "plt.xlabel(\"perplexity (lower is better)\")\n",
1964
+ "plt.ylabel(\"freq\")\n",
1965
+ "\n",
1966
+ "plt.title('Model-based Output Quality/Fluency')\n",
1967
+ "\n",
1968
+ "# plot_name = \"ppl_no_baseline\"\n",
1969
+ "# fname = f\"figs/{plot_name}.png\"\n",
1970
+ "# plt.savefig(fname, dpi=600)"
1971
+ ]
1972
+ },
1973
+ {
1974
+ "cell_type": "code",
1975
+ "execution_count": null,
1976
+ "metadata": {},
1977
+ "outputs": [],
1978
+ "source": [
1979
+ "plt.clf()\n",
1980
+ "\n",
1981
+ "all_vals = list(np.concatenate([all_no_bl_ppls, all_w_bl_ppls]))\n",
1982
+ "all_vals = sorted(all_vals)\n",
1983
+ "n_bins = 50\n",
1984
+ "# bins = np.linspace(all_vals[0], all_vals[-1], n_bins)\n",
1985
+ "bins = np.linspace(all_vals[0], 20, n_bins)\n",
1986
+ "\n",
1987
+ "plt.hist(all_no_bl_ppls, \n",
1988
+ " bins=bins,\n",
1989
+ " alpha=0.6,\n",
1990
+ " label='no blacklisting')\n",
1991
+ "\n",
1992
+ "plt.hist(all_w_bl_ppls, \n",
1993
+ " bins=bins,\n",
1994
+ " alpha=0.6,\n",
1995
+ " label='with blacklisting')\n",
1996
+ "\n",
1997
+ "plt.hist(all_baseline_ppls, \n",
1998
+ " bins=bins, \n",
1999
+ " alpha=0.4,\n",
2000
+ " label='ground truth/real text')\n",
2001
+ "\n",
2002
+ "plt.legend(loc='upper right')\n",
2003
+ "\n",
2004
+ "# plt.xlim((0,1))\n",
2005
+ "plt.xlabel(\"perplexity (lower is better)\")\n",
2006
+ "plt.ylabel(\"freq\")\n",
2007
+ "\n",
2008
+ "plt.title('Model-based Output Quality/Fluency')\n",
2009
+ "\n",
2010
+ "# plot_name = \"ppl_w_baseline\"\n",
2011
+ "# fname = f\"figs/{plot_name}.png\"\n",
2012
+ "# plt.savefig(fname, dpi=600)"
2013
+ ]
2014
+ },
2015
+ {
2016
+ "cell_type": "code",
2017
+ "execution_count": null,
2018
+ "metadata": {},
2019
+ "outputs": [],
2020
+ "source": []
2021
+ }
2022
+ ],
2023
+ "metadata": {
2024
+ "kernelspec": {
2025
+ "display_name": "Python 3",
2026
+ "language": "python",
2027
+ "name": "python3"
2028
+ },
2029
+ "language_info": {
2030
+ "codemirror_mode": {
2031
+ "name": "ipython",
2032
+ "version": 3
2033
+ },
2034
+ "file_extension": ".py",
2035
+ "mimetype": "text/x-python",
2036
+ "name": "python",
2037
+ "nbconvert_exporter": "python",
2038
+ "pygments_lexer": "ipython3",
2039
+ "version": "3.10.6"
2040
+ },
2041
+ "vscode": {
2042
+ "interpreter": {
2043
+ "hash": "365524a309ad80022da286f2ec5d2060ce5cb229abb6076cf68d9a1ab14bd8fe"
2044
+ }
2045
+ }
2046
+ },
2047
+ "nbformat": 4,
2048
+ "nbformat_minor": 4
2049
+ }
lm-watermarking-main/experiments/watermarking_example_finding.ipynb ADDED
@@ -0,0 +1,1007 @@
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "markdown",
5
+ "metadata": {},
6
+ "source": [
7
+ "# Watermark Analysis\n",
8
+ "\n",
9
+ "Notebook for performing analysis and visualization of the effects of watermarking schemes"
10
+ ]
11
+ },
12
+ {
13
+ "cell_type": "code",
14
+ "execution_count": null,
15
+ "metadata": {},
16
+ "outputs": [],
17
+ "source": [
18
+ "from datasets import load_from_disk"
19
+ ]
20
+ },
21
+ {
22
+ "attachments": {},
23
+ "cell_type": "markdown",
24
+ "metadata": {},
25
+ "source": [
26
+ "### Load the processed dataset/frame"
27
+ ]
28
+ },
29
+ {
30
+ "cell_type": "code",
31
+ "execution_count": null,
32
+ "metadata": {},
33
+ "outputs": [],
34
+ "source": [
35
+ "save_name = \"analysis_ds_1-19_realnews_1-3_v1\" # in figure\n",
36
+ "# save_name = \"analysis_ds_1-21_greedy_redo\" \n",
37
+ "# save_name = \"analysis_ds_1-21_greedy_redo_truncated\" # in figure\n",
38
+ "\n",
39
+ "# save_name = \"analysis_ds_1-20_more_attack\" # in figure\n",
40
+ "\n",
41
+ "save_dir = f\"input/{save_name}\""
42
+ ]
43
+ },
44
+ {
45
+ "cell_type": "code",
46
+ "execution_count": null,
47
+ "metadata": {},
48
+ "outputs": [],
49
+ "source": [
50
+ "raw_data = load_from_disk(save_dir)"
51
+ ]
52
+ },
53
+ {
54
+ "cell_type": "markdown",
55
+ "metadata": {},
56
+ "source": [
57
+ "#### convert to pandas df"
58
+ ]
59
+ },
60
+ {
61
+ "cell_type": "code",
62
+ "execution_count": null,
63
+ "metadata": {},
64
+ "outputs": [],
65
+ "source": [
66
+ "df = raw_data.to_pandas()\n",
67
+ "\n",
68
+ "retok_problematic_rows = df[(df['w_bl_whitelist_fraction'] != -1.0) & (df['w_bl_whitelist_fraction'] != 1.0) & (df['bl_type'] == 'hard')]\n",
69
+ "print(f\"Num rows that are hard-blacklisted, and measureable, but still have a non-100% WL fraction: {len(retok_problematic_rows)} out of {len(df[df['bl_type'] == 'hard'])}\")\n",
70
+ "\n",
71
+ "orig_len = len(df)\n",
72
+ "\n",
73
+ "df = df[df[\"no_bl_whitelist_fraction\"] != -1.0]\n",
74
+ "df = df[df[\"w_bl_whitelist_fraction\"] != -1.0]\n",
75
+ "\n",
76
+ "print(f\"Dropped {orig_len-len(df)} rows, new len {len(df)}\")\n",
77
+ "\n",
78
+ "orig_len = len(df)\n",
79
+ "# df = df[df[\"no_bl_ppl\"].isna()]\n",
80
+ "# df = df[df[\"w_bl_ppl\"].isna()]\n",
81
+ "df = df[~(df[\"no_bl_ppl\"].isna() | df[\"w_bl_ppl\"].isna())]\n",
82
+ "print(f\"Dropped {orig_len-len(df)} rows, new len {len(df)}\")\n",
83
+ "\n",
84
+ "orig_len = len(df)\n",
85
+ "\n",
86
+ "df = df[df[\"bl_logit_bias\"] <= 100.0]\n",
87
+ "\n",
88
+ "print(f\"Dropped {orig_len-len(df)} rows, new len {len(df)}\")\n",
89
+ "\n",
90
+ "\n",
91
+ "orig_len = len(df)\n",
92
+ "\n",
93
+ "# df = df[df[\"bl_hparams\"].apply(lambda tup: (tup[0] == False and tup[2] != 1) or (tup[0] == True and tup[2] == 1) or (tup[0] == False))]\n",
94
+ "df = df[((df[\"use_sampling\"]==True) & (df[\"num_beams\"] == 1)) | (df[\"use_sampling\"]==False)]\n",
95
+ "\n",
96
+ "print(f\"Dropped {orig_len-len(df)} rows, new len {len(df)}\")\n",
97
+ "\n",
98
+ "\n",
99
+ "df.loc[df[\"use_sampling\"]==False,\"sampling_temp\"] = df.loc[df[\"use_sampling\"]==False,\"sampling_temp\"].fillna(0.0)\n",
100
+ "df.loc[df[\"use_sampling\"]==True,\"sampling_temp\"] = df.loc[df[\"use_sampling\"]==True,\"sampling_temp\"].fillna(1.0)\n",
101
+ "\n",
102
+ "\n",
103
+ "df.loc[df[\"bl_type\"]==\"hard\",\"bl_logit_bias\"] = np.inf\n",
104
+ "# df.loc[df[\"bl_type\"]==\"hard\",\"bl_logit_bias\"] = 10000 # crosscheck with whats hardcoded in the bl processor\n",
105
+ "\n",
106
+ "\n",
107
+ "df[\"delta\"] = df[\"bl_logit_bias\"].values\n",
108
+ "df[\"gamma\"] = 1 - df[\"bl_proportion\"].values\n",
109
+ "df[\"gamma\"] = df[\"gamma\"].round(3)\n",
110
+ "\n",
111
+ "df[\"no_bl_act_num_wl_tokens\"] = np.round(df[\"no_bl_whitelist_fraction\"].values*df[\"no_bl_num_tokens_generated\"],1) # round to 1 for sanity\n",
112
+ "df[\"w_bl_act_num_wl_tokens\"] = np.round(df[\"w_bl_whitelist_fraction\"].values*df[\"w_bl_num_tokens_generated\"],1) # round to 1 for sanity\n",
113
+ "\n",
114
+ "df[\"w_bl_std_num_wl_tokens\"] = np.sqrt(df[\"w_bl_var_num_wl_tokens\"].values)\n",
115
+ "\n",
116
+ "if \"real_completion_length\":\n",
117
+ " df[\"baseline_num_tokens_generated\"] = df[\"real_completion_length\"].values\n",
118
+ "\n",
119
+ "if \"actual_attacked_ratio\" in df.columns:\n",
120
+ " df[\"actual_attacked_fraction\"] = df[\"actual_attacked_ratio\"].values*df[\"replace_ratio\"].values\n",
121
+ "\n",
122
+ "\n",
123
+ "\n",
124
+ "df[\"baseline_hit_list_length\"] = df[\"baseline_hit_list\"].apply(len)\n",
125
+ "df[\"no_bl_hit_list_length\"] = df[\"no_bl_hit_list\"].apply(len)\n",
126
+ "df[\"w_bl_hit_list_length\"] = df[\"w_bl_hit_list\"].apply(len)"
127
+ ]
128
+ },
129
+ {
130
+ "attachments": {},
131
+ "cell_type": "markdown",
132
+ "metadata": {},
133
+ "source": [
134
+ "## Filter for the generation lengths we want to look at"
135
+ ]
136
+ },
137
+ {
138
+ "cell_type": "code",
139
+ "execution_count": null,
140
+ "metadata": {},
141
+ "outputs": [],
142
+ "source": [
143
+ "orig_len = len(df)\n",
144
+ "\n",
145
+ "upper_T = 205\n",
146
+ "lower_T = 195\n",
147
+ "df = df[(df[\"baseline_hit_list_length\"] >= lower_T) & (df[\"no_bl_hit_list_length\"] >= lower_T) & (df[\"w_bl_hit_list_length\"] >= lower_T)] # now also applies to the truncated version\n",
148
+ "df = df[(df[\"baseline_hit_list_length\"] <= upper_T) & (df[\"no_bl_hit_list_length\"] <= upper_T) & (df[\"w_bl_hit_list_length\"] <= upper_T)] # now also applies to the truncated version\n",
149
+ "\n",
150
+ "\n",
151
+ "print(f\"Dropped {orig_len-len(df)} rows, new len {len(df)}\")"
152
+ ]
153
+ },
154
+ {
155
+ "attachments": {},
156
+ "cell_type": "markdown",
157
+ "metadata": {},
158
+ "source": [
159
+ "#### Add z-scores"
160
+ ]
161
+ },
162
+ {
163
+ "cell_type": "code",
164
+ "execution_count": null,
165
+ "metadata": {},
166
+ "outputs": [],
167
+ "source": [
168
+ "from math import sqrt\n",
169
+ "import scipy.stats\n",
170
+ "def compute_z_score(observed_wl_frac, T, gamma):\n",
171
+ " numer = observed_wl_frac - gamma\n",
172
+ " denom = sqrt(gamma*(1-gamma)/T)\n",
173
+ " z = numer/denom\n",
174
+ " return z\n",
175
+ "\n",
176
+ "def compute_wl_for_z(z, T, gamma):\n",
177
+ " denom = sqrt(gamma*(1-gamma)/T)\n",
178
+ " numer = ((z*denom)+gamma)*T\n",
179
+ " return numer\n",
180
+ "\n",
181
+ "def compute_p_value(z):\n",
182
+ " p_value = scipy.stats.norm.sf(abs(z))\n",
183
+ " return p_value\n",
184
+ "\n",
185
+ "df[\"baseline_z_score\"] = df[[\"baseline_whitelist_fraction\", \"baseline_num_tokens_generated\", \"gamma\"]].apply(lambda tup: compute_z_score(*tup), axis=1)\n",
186
+ "df[\"no_bl_z_score\"] = df[[\"no_bl_whitelist_fraction\", \"no_bl_num_tokens_generated\", \"gamma\"]].apply(lambda tup: compute_z_score(*tup), axis=1)\n",
187
+ "df[\"w_bl_z_score\"] = df[[\"w_bl_whitelist_fraction\", \"w_bl_num_tokens_generated\", \"gamma\"]].apply(lambda tup: compute_z_score(*tup), axis=1)\n",
188
+ "\n",
189
+ "if \"w_bl_attacked_whitelist_fraction\" in df.columns:\n",
190
+ " df[\"w_bl_attacked_z_score\"] = df[[\"w_bl_attacked_whitelist_fraction\", \"w_bl_attacked_num_tokens_generated\", \"gamma\"]].apply(lambda tup: compute_z_score(*tup), axis=1)"
191
+ ]
192
+ },
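+ {
+ "attachments": {},
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "A minimal usage sketch (added for illustration, not part of the original analysis): how the helpers above turn an observed whitelist fraction into a detection decision. The token count `T=200` and `gamma=0.5` are assumed example values, chosen to match the generation-length filter and one of the swept settings."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# hedged example: score a hypothetical generation of T=200 tokens where 78% of tokens landed in the whitelist\n",
+ "example_z = compute_z_score(observed_wl_frac=0.78, T=200, gamma=0.5)\n",
+ "example_p = compute_p_value(example_z)\n",
+ "print(f\"z = {example_z:.2f}, one-sided p = {example_p:.2e}, flagged at z > 4.0: {example_z > 4.0}\")"
+ ]
+ },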
193
+ {
194
+ "cell_type": "code",
195
+ "execution_count": null,
196
+ "metadata": {},
197
+ "outputs": [],
198
+ "source": [
199
+ "# # if attacked in df\n",
200
+ "if \"w_bl_attacked_whitelist_fraction\" in df.columns:\n",
201
+ " df[\"w_bl_attacked_act_num_wl_tokens\"] = np.round(df[\"w_bl_attacked_whitelist_fraction\"].values*df[\"w_bl_attacked_num_tokens_generated\"],1) # round to 1 for sanity\n",
202
+ "\n",
203
+ " df[\"w_bl_attacked_z_score\"] = df[[\"w_bl_attacked_whitelist_fraction\", \"w_bl_attacked_num_tokens_generated\", \"gamma\"]].apply(lambda tup: compute_z_score(*tup), axis=1)\n",
204
+ "\n",
205
+ " df[[\"bl_proportion\",\"w_bl_attacked_whitelist_fraction\", \"w_bl_attacked_num_tokens_generated\",\"w_bl_attacked_act_num_wl_tokens\", \"w_bl_attacked_z_score\"]]"
206
+ ]
207
+ },
208
+ {
209
+ "cell_type": "markdown",
210
+ "metadata": {},
211
+ "source": [
212
+ "#### Groupby"
213
+ ]
214
+ },
215
+ {
216
+ "cell_type": "code",
217
+ "execution_count": null,
218
+ "metadata": {},
219
+ "outputs": [],
220
+ "source": [
221
+ "if \"w_bl_attacked_whitelist_fraction\" in df.columns: \n",
222
+ " groupby_fields = ['use_sampling','num_beams','gamma','delta', 'replace_ratio'] # attack grouping\n",
223
+ "else:\n",
224
+ " groupby_fields = ['use_sampling','num_beams','delta','gamma'] # regular grouping\n",
225
+ " # groupby_fields = ['use_sampling','delta','gamma'] # regular grouping, but no beam variation\n",
226
+ " # groupby_fields = ['delta','gamma'] # regular grouping, but no beam variation, and all sampling"
227
+ ]
228
+ },
229
+ {
230
+ "attachments": {},
231
+ "cell_type": "markdown",
232
+ "metadata": {},
233
+ "source": [
234
+ "#### Main groupby"
235
+ ]
236
+ },
237
+ {
238
+ "cell_type": "code",
239
+ "execution_count": null,
240
+ "metadata": {},
241
+ "outputs": [],
242
+ "source": [
243
+ "grouped_df = df.groupby(groupby_fields)"
244
+ ]
245
+ },
246
+ {
247
+ "cell_type": "code",
248
+ "execution_count": null,
249
+ "metadata": {},
250
+ "outputs": [],
251
+ "source": [
252
+ "print(f\"Number of rows after filtering: {len(df)}\")\n",
253
+ "print(f\"Number of groups: {len(grouped_df)}\")"
254
+ ]
255
+ },
256
+ {
257
+ "attachments": {},
258
+ "cell_type": "markdown",
259
+ "metadata": {},
260
+ "source": [
261
+ "### Loop to compute confusion matrix at some z scores for tabulation"
262
+ ]
263
+ },
264
+ {
265
+ "cell_type": "code",
266
+ "execution_count": null,
267
+ "metadata": {},
268
+ "outputs": [],
269
+ "source": [
270
+ "import sklearn.metrics as metrics\n",
271
+ "\n",
272
+ "def reject_null_hypo(z_score=None,cuttoff=None):\n",
273
+ " return z_score > cuttoff\n",
274
+ "\n",
275
+ "records = []\n",
276
+ "\n",
277
+ "for group_params in tqdm(list(grouped_df.groups.keys())):\n",
278
+ " sub_df = grouped_df.get_group(group_params)\n",
279
+ " grp_size = len(sub_df)\n",
280
+ "\n",
281
+ " # baseline_z_scores = sub_df[\"baseline_z_score\"].values\n",
282
+ " # w_bl_z_scores = sub_df[\"w_bl_z_score\"].values\n",
283
+ " # all_scores = np.concatenate([baseline_z_scores,w_bl_z_scores])\n",
284
+ "\n",
285
+ " # baseline_labels = np.zeros_like(baseline_z_scores)\n",
286
+ " # attacked_labels = np.ones_like(w_bl_z_scores)\n",
287
+ " # all_labels = np.concatenate([baseline_labels,attacked_labels])\n",
288
+ "\n",
289
+ " # fpr, tpr, thresholds = metrics.roc_curve(all_labels, all_scores, pos_label=1)\n",
290
+ " # roc_auc = metrics.auc(fpr, tpr)\n",
291
+ " record = {k:v for k,v in zip(groupby_fields,group_params)}\n",
292
+ "\n",
293
+ " for thresh in [4.0,5.0]:\n",
294
+ " \n",
295
+ " record[\"count\"] = grp_size\n",
296
+ " record[f\"baseline_fpr_at_{thresh}\"] = reject_null_hypo(z_score=sub_df[\"baseline_z_score\"].values,cuttoff=thresh).sum() / grp_size\n",
297
+ " record[f\"baseline_tnr_at_{thresh}\"] = (~reject_null_hypo(z_score=sub_df[\"baseline_z_score\"],cuttoff=thresh)).sum() / grp_size\n",
298
+ " record[f\"no_bl_fpr_at_{thresh}\"] = reject_null_hypo(z_score=sub_df[\"no_bl_z_score\"].values,cuttoff=thresh).sum() / grp_size\n",
299
+ " record[f\"no_bl_tnr_at_{thresh}\"] = (~reject_null_hypo(z_score=sub_df[\"no_bl_z_score\"].values,cuttoff=thresh)).sum() / grp_size\n",
300
+ " record[f\"w_bl_tpr_at_{thresh}\"] = reject_null_hypo(z_score=sub_df[\"w_bl_z_score\"].values,cuttoff=thresh).sum() / grp_size\n",
301
+ " record[f\"w_bl_fnr_at_{thresh}\"] = (~reject_null_hypo(z_score=sub_df[\"w_bl_z_score\"].values,cuttoff=thresh)).sum() / grp_size\n",
302
+ "\n",
303
+ " if \"w_bl_attacked_z_score\" in sub_df.columns:\n",
304
+ " record[f\"w_bl_attacked_tpr_at_{thresh}\"] = reject_null_hypo(z_score=sub_df[\"w_bl_attacked_z_score\"].values,cuttoff=thresh).sum() / grp_size\n",
305
+ " record[f\"w_bl_attacked_fnr_at_{thresh}\"] = (~reject_null_hypo(z_score=sub_df[\"w_bl_attacked_z_score\"].values,cuttoff=thresh)).sum() / grp_size\n",
306
+ "\n",
307
+ " records.append(record)\n",
308
+ "\n",
309
+ " # # df[f\"baseline_fp_at_{thresh}\"] = reject_null_hypo(z_score=df[\"baseline_z_score\"].values,cuttoff=thresh)\n",
310
+ " # # df[f\"baseline_tn_at_{thresh}\"] = ~reject_null_hypo(z_score=df[\"baseline_z_score\"],cuttoff=thresh)\n",
311
+ " # # df[f\"no_bl_fp_at_{thresh}\"] = reject_null_hypo(z_score=df[\"no_bl_z_score\"].values,cuttoff=thresh)\n",
312
+ " # # df[f\"no_bl_tn_at_{thresh}\"] = ~reject_null_hypo(z_score=df[\"no_bl_z_score\"].values,cuttoff=thresh)\n",
313
+ " # # df[f\"w_bl_tp_at_{thresh}\"] = reject_null_hypo(z_score=df[\"w_bl_z_score\"].values,cuttoff=thresh)\n",
314
+ " # # df[f\"w_bl_fn_at_{thresh}\"] = ~reject_null_hypo(z_score=df[\"w_bl_z_score\"].values,cuttoff=thresh)\n",
315
+ "\n",
316
+ "\n",
317
+ "roc_df = pd.DataFrame.from_records(records)\n"
318
+ ]
319
+ },
320
+ {
321
+ "cell_type": "code",
322
+ "execution_count": null,
323
+ "metadata": {},
324
+ "outputs": [],
325
+ "source": [
326
+ "# thresh = 6.0\n",
327
+ "# thresh = 5.0\n",
328
+ "std_threshes = [4.0, 5.0] #, 6.0]\n",
329
+ "# std_threshes = [4.0]\n",
330
+ "\n",
331
+ "# roc_df[\"params\"] = roc_df.index.to_list()\n",
332
+ "\n",
333
+ "columns = [\"delta\", \"gamma\", \"count\"]\n",
334
+ "# columns = [\"use_sampling\", \"replace_ratio\", \"count\"]\n",
335
+ "\n",
336
+ "for thresh in std_threshes:\n",
337
+ " # columns += [f\"baseline_fpr_at_{thresh}\",f\"no_bl_fpr_at_{thresh}\",f\"w_bl_tpr_at_{thresh}\"]\n",
338
+ " # columns += [f\"baseline_fpr_at_{thresh}\",f\"baseline_tnr_at_{thresh}\",f\"no_bl_fpr_at_{thresh}\",f\"no_bl_tnr_at_{thresh}\",f\"w_bl_tpr_at_{thresh}\",f\"w_bl_fn_at_{thresh}\"]\n",
339
+ "\n",
340
+ "\n",
341
+ " # columns += [f\"baseline_fpr_at_{thresh}\",f\"baseline_tnr_at_{thresh}\",f\"w_bl_tpr_at_{thresh}\",f\"w_bl_fnr_at_{thresh}\"]\n",
342
+ " \n",
343
+ " if f\"w_bl_attacked_fnr_at_{thresh}\" in roc_df.columns:\n",
344
+ " columns += [f\"w_bl_tpr_at_{thresh}\",f\"w_bl_fnr_at_{thresh}\"]\n",
345
+ " columns += [f\"w_bl_attacked_tpr_at_{thresh}\",f\"w_bl_attacked_fnr_at_{thresh}\"] # if attack\n",
346
+ " else:\n",
347
+ " columns += [f\"baseline_fpr_at_{thresh}\",f\"baseline_tnr_at_{thresh}\",f\"w_bl_tpr_at_{thresh}\",f\"w_bl_fnr_at_{thresh}\"]\n",
348
+ "\n",
349
+ "# filter ot not\n",
350
+ "sub_df = roc_df[(roc_df[\"use_sampling\"] == True) & ((roc_df[\"delta\"] == 1.0) | (roc_df[\"delta\"] == 2.0) | (roc_df[\"delta\"] == 10.0)) & ((roc_df[\"gamma\"] == 0.1) | (roc_df[\"gamma\"] == 0.25) |(roc_df[\"gamma\"] == 0.5) )]\n",
351
+ "# sub_df = roc_df[(roc_df[\"replace_ratio\"] == 0.1) | (roc_df[\"replace_ratio\"] == 0.3) | (roc_df[\"replace_ratio\"] == 0.5) | (roc_df[\"replace_ratio\"] == 0.7)]\n",
352
+ "# sub_df = roc_df\n",
353
+ "\n",
354
+ "sub_df.sort_values(\"delta\")[columns]\n",
355
+ "# sub_df.sort_values(\"num_beams\")[columns]"
356
+ ]
357
+ },
358
+ {
359
+ "cell_type": "code",
360
+ "execution_count": null,
361
+ "metadata": {},
362
+ "outputs": [],
363
+ "source": [
364
+ "# print(roc_df[columns].drop([\"count\"],axis=1).sort_values(\"gamma\").round(3).to_latex(index=False))\n",
365
+ "# print(roc_df[columns].drop([\"count\"],axis=1).sort_values(\"delta\").round(3).to_latex(index=False))\n",
366
+ "# print(roc_df[columns].drop([\"count\"],axis=1).sort_values(\"num_beams\").round(3).to_latex(index=False))\n",
367
+ "\n",
368
+ "print(sub_df.sort_values(\"delta\")[columns].round(3).to_latex(index=False))\n",
369
+ "# print(sub_df.sort_values(\"num_beams\")[columns].round(3).to_latex(index=False))"
370
+ ]
371
+ },
372
+ {
373
+ "attachments": {},
374
+ "cell_type": "markdown",
375
+ "metadata": {},
376
+ "source": [
377
+ "### write to csv maybe"
378
+ ]
379
+ },
380
+ {
381
+ "cell_type": "code",
382
+ "execution_count": null,
383
+ "metadata": {},
384
+ "outputs": [],
385
+ "source": [
386
+ "# cols_to_drop = ['no_bl_gen_time',\n",
387
+ "# 'w_bl_gen_time', 'spike_entropies', \n",
388
+ "# 'no_bl_sec_per_tok', 'no_bl_tok_per_sec', 'w_bl_sec_per_tok',\n",
389
+ "# 'w_bl_tok_per_sec', 'baseline_loss','no_bl_loss',\n",
390
+ "# 'w_bl_loss', 'model_name', 'dataset_name',\n",
391
+ "# 'dataset_config_name', 'shuffle_dataset', 'shuffle_seed',\n",
392
+ "# 'shuffle_buffer_size', 'max_new_tokens', 'min_prompt_tokens',\n",
393
+ "# 'limit_indices', 'input_truncation_strategy',\n",
394
+ "# 'input_filtering_strategy', 'output_filtering_strategy', 'initial_seed',\n",
395
+ "# 'dynamic_seed','no_repeat_ngram_size', 'early_stopping',\n",
396
+ "# 'oracle_model_name', 'no_wandb', 'wandb_project', 'wandb_entity', 'output_dir', 'load_prev_generations', 'store_bl_ids',\n",
397
+ "# 'store_spike_ents', 'generate_only',\n",
398
+ "# 'SLURM_JOB_ID', 'SLURM_ARRAY_JOB_ID', 'SLURM_ARRAY_TASK_ID',\n",
399
+ "# 'gen_table_already_existed', 'baseline_num_toks_gend_eq_0',\n",
400
+ "# 'baseline_hit_list', 'no_bl_num_toks_gend_eq_0',\n",
401
+ "# 'no_bl_hit_list', 'w_bl_num_toks_gend_eq_0', 'w_bl_hit_list']\n",
402
+ "# df.drop(cols_to_drop,axis=1).to_csv(\"input/for_poking.csv\")\n",
403
+ "# df"
404
+ ]
405
+ },
406
+ {
407
+ "cell_type": "code",
408
+ "execution_count": null,
409
+ "metadata": {},
410
+ "outputs": [],
411
+ "source": [
412
+ "df.columns"
413
+ ]
414
+ },
415
+ {
416
+ "cell_type": "code",
417
+ "execution_count": null,
418
+ "metadata": {},
419
+ "outputs": [],
420
+ "source": []
421
+ },
422
+ {
423
+ "attachments": {},
424
+ "cell_type": "markdown",
425
+ "metadata": {},
426
+ "source": [
427
+ "# Extract examples (actual text) for tabulation based on entropy and z scores (tables 1,3,4,5,6)"
428
+ ]
429
+ },
430
+ {
431
+ "cell_type": "code",
432
+ "execution_count": null,
433
+ "metadata": {},
434
+ "outputs": [],
435
+ "source": [
436
+ "print(f\"groupby legend: {groupby_fields}\")"
437
+ ]
438
+ },
439
+ {
440
+ "cell_type": "code",
441
+ "execution_count": null,
442
+ "metadata": {},
443
+ "outputs": [],
444
+ "source": [
445
+ "groups = [\n",
446
+ " (True, 1, 2.0, 0.5),\n",
447
+ " # (True, 1, 10.0, 0.5),\n",
448
+ " # (False, 8, 2.0, 0.5),\n",
449
+ " # (False, 8, 10.0, 0.5),\n",
450
+ "]\n",
451
+ "group_dfs = []\n",
452
+ "for group in groups:\n",
453
+ " sub_df = grouped_df.get_group(group)\n",
454
+ " group_dfs.append(sub_df)\n",
455
+ "\n",
456
+ "subset_df = pd.concat(group_dfs,axis=0)\n",
457
+ "\n",
458
+ "print(len(subset_df))\n",
459
+ "# subset_df\n",
460
+ "\n",
461
+ "# cols_to_tabulate = groupby_fields + [\n",
462
+ "cols_to_tabulate = [\n",
463
+ " 'idx', \n",
464
+ " 'truncated_input', \n",
465
+ " # 'prompt_length',\n",
466
+ " 'baseline_completion',\n",
467
+ " 'no_bl_output', \n",
468
+ " 'w_bl_output', \n",
469
+ " # 'real_completion_length',\n",
470
+ " # 'no_bl_num_tokens_generated',\n",
471
+ " # 'w_bl_num_tokens_generated',\n",
472
+ " 'avg_spike_entropy',\n",
473
+ " # 'baseline_whitelist_fraction',\n",
474
+ " # 'no_bl_whitelist_fraction',\n",
475
+ " # 'w_bl_whitelist_fraction',\n",
476
+ " # 'baseline_z_score',\n",
477
+ " 'no_bl_z_score',\n",
478
+ " 'w_bl_z_score',\n",
479
+ " # 'baseline_ppl',\n",
480
+ " 'no_bl_ppl',\n",
481
+ " 'w_bl_ppl'\n",
482
+ "]\n",
483
+ "\n",
484
+ "# subset_df[cols_to_tabulate][\"idx\"].value_counts()\n",
485
+ "\n",
486
+ "for idx,occurrences in subset_df[\"idx\"].value_counts().to_dict().items():\n",
487
+ " subset_df.loc[(subset_df[\"idx\"]==idx),\"occurences\"] = occurrences\n",
488
+ "\n",
489
+ "subset_df[\"occurences\"] = subset_df[\"occurences\"].apply(int)\n",
490
+ "\n",
491
+ "# cols_to_tabulate = [\"occurences\"] + cols_to_tabulate"
492
+ ]
493
+ },
494
+ {
495
+ "cell_type": "code",
496
+ "execution_count": null,
497
+ "metadata": {},
498
+ "outputs": [],
499
+ "source": [
500
+ "# subset_df[cols_to_tabulate].sort_values([\"occurences\", \"idx\"],ascending=False)\n",
501
+ "# subset_df[cols_to_tabulate].sort_values([\"avg_spike_entropy\"],ascending=False)"
502
+ ]
503
+ },
504
+ {
505
+ "cell_type": "code",
506
+ "execution_count": null,
507
+ "metadata": {},
508
+ "outputs": [],
509
+ "source": [
510
+ "max_prompt_chars = 200\n",
511
+ "max_output_chars = 200\n",
512
+ "# subset_df[\"truncated_input\"] = subset_df[\"truncated_input\"].apply(lambda s: f\"[...]{s[-max_prompt_chars:]}\")\n",
513
+ "# subset_df[\"baseline_completion\"] = subset_df[\"baseline_completion\"].apply(lambda s: f\"{s[:max_output_chars]}[...truncated]\")\n",
514
+ "# subset_df[\"no_bl_output\"] = subset_df[\"no_bl_output\"].apply(lambda s: f\"{s[:max_output_chars]}[...truncated]\")\n",
515
+ "# subset_df[\"w_bl_output\"] = subset_df[\"w_bl_output\"].apply(lambda s: f\"{s[:max_output_chars]}[...truncated]\")\n",
516
+ "\n",
517
+ "# if you dont have the indexx you cant start with brackets\n",
518
+ "subset_df[\"truncated_input\"] = subset_df[\"truncated_input\"].apply(lambda s: f\"(...){s[-max_prompt_chars:]}\")\n",
519
+ "subset_df[\"baseline_completion\"] = subset_df[\"baseline_completion\"].apply(lambda s: f\"{s[:max_output_chars]}[...continues]\")\n",
520
+ "subset_df[\"no_bl_output\"] = subset_df[\"no_bl_output\"].apply(lambda s: f\"{s[:max_output_chars]}[...continues]\")\n",
521
+ "subset_df[\"w_bl_output\"] = subset_df[\"w_bl_output\"].apply(lambda s: f\"{s[:max_output_chars]}[...continues]\")\n"
522
+ ]
523
+ },
524
+ {
525
+ "cell_type": "code",
526
+ "execution_count": null,
527
+ "metadata": {},
528
+ "outputs": [],
529
+ "source": [
530
+ "slice_size = 2\n",
531
+ "\n",
532
+ "# subset_df[cols_to_tabulate][\"avg_spike_entropy\"].describe()[]"
533
+ ]
534
+ },
535
+ {
536
+ "cell_type": "code",
537
+ "execution_count": null,
538
+ "metadata": {},
539
+ "outputs": [],
540
+ "source": [
541
+ "num_examples = len(subset_df)\n",
542
+ "midpt = num_examples//5\n",
543
+ "lower = midpt - (slice_size//2)\n",
544
+ "upper = midpt + (slice_size//2)+1\n",
545
+ "\n",
546
+ "high_entropy_examples = subset_df[cols_to_tabulate].sort_values([\"avg_spike_entropy\"],ascending=True).tail(slice_size)\n",
547
+ "mid_entropy_examples = subset_df[cols_to_tabulate].sort_values([\"avg_spike_entropy\"],ascending=True).iloc[lower:upper]\n",
548
+ "low_entropy_examples = subset_df[cols_to_tabulate].sort_values([\"avg_spike_entropy\"],ascending=True).head(slice_size)\n",
549
+ "\n",
550
+ "num_examples = len(subset_df)\n",
551
+ "midpt = num_examples//65\n",
552
+ "lower = midpt - (slice_size//2)\n",
553
+ "upper = midpt + (slice_size//2)+1\n",
554
+ "\n",
555
+ "high_z_examples = subset_df[cols_to_tabulate].sort_values([\"w_bl_z_score\"],ascending=True).tail(slice_size)\n",
556
+ "mid_z_examples = subset_df[cols_to_tabulate].sort_values([\"w_bl_z_score\"],ascending=True).iloc[lower:upper]\n",
557
+ "low_z_examples = subset_df[cols_to_tabulate].sort_values([\"w_bl_z_score\"],ascending=True).head(slice_size)"
558
+ ]
559
+ },
560
+ {
561
+ "cell_type": "code",
562
+ "execution_count": null,
563
+ "metadata": {},
564
+ "outputs": [],
565
+ "source": [
566
+ "# high_entropy_examples.head()\n",
567
+ "high_z_examples.head()"
568
+ ]
569
+ },
570
+ {
571
+ "cell_type": "code",
572
+ "execution_count": null,
573
+ "metadata": {},
574
+ "outputs": [],
575
+ "source": [
576
+ "# mid_entropy_examples.head()\n",
577
+ "mid_z_examples.head()\n"
578
+ ]
579
+ },
580
+ {
581
+ "cell_type": "code",
582
+ "execution_count": null,
583
+ "metadata": {},
584
+ "outputs": [],
585
+ "source": [
586
+ "# low_entropy_examples.head()\n",
587
+ "low_z_examples.head()"
588
+ ]
589
+ },
590
+ {
591
+ "cell_type": "code",
592
+ "execution_count": null,
593
+ "metadata": {},
594
+ "outputs": [],
595
+ "source": [
596
+ "# slices_set_df = pd.concat([high_entropy_examples,low_entropy_examples],axis=0)\n",
597
+ "slices_set_df = pd.concat([high_z_examples,low_z_examples],axis=0).sort_values(\"w_bl_z_score\",ascending=False)\n",
598
+ "slices_set_df"
599
+ ]
600
+ },
601
+ {
602
+ "cell_type": "code",
603
+ "execution_count": null,
604
+ "metadata": {},
605
+ "outputs": [],
606
+ "source": [
607
+ "# slices_set_df.T.iloc[:,0:2]"
608
+ ]
609
+ },
610
+ {
611
+ "cell_type": "code",
612
+ "execution_count": null,
613
+ "metadata": {},
614
+ "outputs": [],
615
+ "source": [
616
+ "# print(slices_set_df.to_latex(index=False))\n",
617
+ "# print(low_entropy_examples.to_latex(index=False))\n",
618
+ "# print(mid_entropy_examples.to_latex(index=False))\n",
619
+ "# print(high_entropy_examples.to_latex(index=False))"
620
+ ]
621
+ },
622
+ {
623
+ "cell_type": "code",
624
+ "execution_count": null,
625
+ "metadata": {},
626
+ "outputs": [],
627
+ "source": [
628
+ "# for c,t in zip(low_entropy_examples.columns,low_entropy_examples.dtypes):\n",
629
+ "# if t==object:\n",
630
+ "# low_entropy_examples[c] = low_entropy_examples[c].apply(lambda s: f\"{s[:100]}[...truncated]\")"
631
+ ]
632
+ },
633
+ {
634
+ "cell_type": "code",
635
+ "execution_count": null,
636
+ "metadata": {},
637
+ "outputs": [],
638
+ "source": [
639
+ "# low_entropy_examples.T.to_latex(buf=open(\"figs/low_ent_examples.txt\", \"w\"),index=False)"
640
+ ]
641
+ },
642
+ {
643
+ "cell_type": "code",
644
+ "execution_count": null,
645
+ "metadata": {},
646
+ "outputs": [],
647
+ "source": [
648
+ "# df_to_write = high_entropy_examples\n",
649
+ "# df_to_write = mid_entropy_examples\n",
650
+ "# df_to_write = low_entropy_examples\n",
651
+ "# df_to_write = high_z_examples\n",
652
+ "# df_to_write = mid_z_examples\n",
653
+ "# df_to_write = low_z_examples\n",
654
+ "\n",
655
+ "cols_to_drop = [\"idx\", \"avg_spike_entropy\", \"no_bl_z_score\"] #, \"no_bl_ppl\", \"w_bl_ppl\"]\n",
656
+ "df_to_write = slices_set_df.drop(cols_to_drop,axis=1)\n",
657
+ "\n",
658
+ "\n",
659
+ "with pd.option_context(\"max_colwidth\", 1000):\n",
660
+ " column_format=\"\".join([(r'p{3cm}|' if t==object else r'p{0.4cm}|') for c,t in zip(df_to_write.columns,df_to_write.dtypes)])[:-1]\n",
661
+ " # low_entropy_examples.round(2).to_latex(buf=open(\"figs/low_ent_examples.txt\", \"w\"),column_format=column_format,index=False)\n",
662
+ " latex_str = df_to_write.round(2).to_latex(column_format=column_format,index=False)\n",
663
+ "\n",
664
+ "print(latex_str)"
665
+ ]
666
+ },
667
+ {
668
+ "cell_type": "code",
669
+ "execution_count": null,
670
+ "metadata": {},
671
+ "outputs": [],
672
+ "source": []
673
+ },
674
+ {
675
+ "cell_type": "code",
676
+ "execution_count": null,
677
+ "metadata": {},
678
+ "outputs": [],
679
+ "source": []
680
+ },
681
+ {
682
+ "cell_type": "code",
683
+ "execution_count": null,
684
+ "metadata": {},
685
+ "outputs": [],
686
+ "source": [
687
+ "# column_format=\"\".join([r'p{2cm}|' for c in low_entropy_examples.columns])\n",
688
+ "# column_format"
689
+ ]
690
+ },
691
+ {
692
+ "cell_type": "code",
693
+ "execution_count": null,
694
+ "metadata": {},
695
+ "outputs": [],
696
+ "source": [
697
+ "# low_entropy_examples.dtypes"
698
+ ]
699
+ },
700
+ {
701
+ "cell_type": "code",
702
+ "execution_count": null,
703
+ "metadata": {},
704
+ "outputs": [],
705
+ "source": [
706
+ "with pd.option_context(\"max_colwidth\", 1000):\n",
707
+ " print(grouped_df.get_group((True, 1, 2.0, 0.9)).head(10)[\"w_bl_output\"])"
708
+ ]
709
+ },
710
+ {
711
+ "cell_type": "code",
712
+ "execution_count": null,
713
+ "metadata": {},
714
+ "outputs": [],
715
+ "source": []
716
+ },
717
+ {
718
+ "cell_type": "code",
719
+ "execution_count": null,
720
+ "metadata": {},
721
+ "outputs": [],
722
+ "source": []
723
+ },
724
+ {
725
+ "attachments": {},
726
+ "cell_type": "markdown",
727
+ "metadata": {},
728
+ "source": [
729
+ "### Set up data for charts"
730
+ ]
731
+ },
732
+ {
733
+ "cell_type": "code",
734
+ "execution_count": null,
735
+ "metadata": {},
736
+ "outputs": [],
737
+ "source": [
738
+ "# viz_df = pd.DataFrame()\n",
739
+ "\n",
740
+ "# # set the hparam keys, including an indiv column for each you want to ablate on\n",
741
+ "# viz_df[\"bl_hparams\"] = grouped_df[\"w_bl_exp_whitelist_fraction\"].describe().index.to_list()\n",
742
+ "# for i,key in enumerate(groupby_fields):\n",
743
+ "# viz_df[key] = viz_df[\"bl_hparams\"].apply(lambda tup: tup[i])\n",
744
+ "\n",
745
+ "# describe_dict = grouped_df[\"w_bl_whitelist_fraction\"].describe()\n",
746
+ "# viz_df[\"w_bl_whitelist_fraction_mean\"] = describe_dict[\"mean\"].to_list()\n",
747
+ "# viz_df[\"w_bl_whitelist_fraction_std\"] = describe_dict[\"std\"].to_list()\n",
748
+ "\n",
749
+ "# describe_dict = grouped_df[\"no_bl_whitelist_fraction\"].describe()\n",
750
+ "# viz_df[\"no_bl_whitelist_fraction_mean\"] = describe_dict[\"mean\"].to_list()\n",
751
+ "# viz_df[\"no_bl_whitelist_fraction_std\"] = describe_dict[\"std\"].to_list()\n",
752
+ "\n",
753
+ "\n",
754
+ "# describe_dict = grouped_df[\"w_bl_z_score\"].describe()\n",
755
+ "# viz_df[\"w_bl_z_score_mean\"] = describe_dict[\"mean\"].to_list()\n",
756
+ "# viz_df[\"w_bl_z_score_std\"] = describe_dict[\"std\"].to_list()\n",
757
+ "\n",
758
+ "# describe_dict = grouped_df[\"no_bl_z_score\"].describe()\n",
759
+ "# viz_df[\"no_bl_z_score_mean\"] = describe_dict[\"mean\"].to_list()\n",
760
+ "# viz_df[\"no_bl_z_score_std\"] = describe_dict[\"std\"].to_list()\n",
761
+ "\n",
762
+ "\n",
763
+ "# describe_dict = grouped_df[\"w_bl_ppl\"].describe()\n",
764
+ "# viz_df[\"w_bl_ppl_mean\"] = describe_dict[\"mean\"].to_list()\n",
765
+ "# viz_df[\"w_bl_ppl_std\"] = describe_dict[\"std\"].to_list()\n",
766
+ "\n",
767
+ "# describe_dict = grouped_df[\"no_bl_ppl\"].describe()\n",
768
+ "# viz_df[\"no_bl_ppl_mean\"] = describe_dict[\"mean\"].to_list()\n",
769
+ "# viz_df[\"no_bl_ppl_std\"] = describe_dict[\"std\"].to_list()\n",
770
+ "\n",
771
+ "# describe_dict = grouped_df[\"avg_spike_entropy\"].describe()\n",
772
+ "# viz_df[\"avg_spike_entropy_mean\"] = describe_dict[\"mean\"].to_list()\n",
773
+ "# viz_df[\"avg_spike_entropy_std\"] = describe_dict[\"std\"].to_list()\n",
774
+ "\n",
775
+ "# print(f\"groupby legend: {groupby_fields}\")\n"
776
+ ]
777
+ },
778
+ {
779
+ "cell_type": "code",
780
+ "execution_count": null,
781
+ "metadata": {},
782
+ "outputs": [],
783
+ "source": [
784
+ "# # filtering\n",
785
+ "\n",
786
+ "# viz_df = viz_df[viz_df[\"bl_hparams\"].apply(lambda tup: (tup[0] == True))] # sampling\n",
787
+ "\n",
788
+ "# # viz_df = viz_df[viz_df[\"bl_hparams\"].apply(lambda tup: (tup[0] == False))] # greedy\n",
789
+ "\n",
790
+ "\n",
791
+ "# # fix one of the bl params for analytic chart\n",
792
+ "# viz_df = viz_df[(viz_df[\"gamma\"]==0.5) & (viz_df[\"delta\"]<=10.0)]\n",
793
+ "\n",
794
+ "# # viz_df = viz_df[(viz_df[\"delta\"] > 0.5) & (viz_df[\"delta\"]<=10.0)]\n",
795
+ "\n",
796
+ "# # viz_df = viz_df[(viz_df[\"delta\"]==0.5) | (viz_df[\"delta\"]==2.0) | (viz_df[\"delta\"]==10.0)]\n",
797
+ "\n",
798
+ "# # viz_df = viz_df[(viz_df[\"delta\"]!=0.1)&(viz_df[\"delta\"]!=0.5)&(viz_df[\"delta\"]!=50.0)]\n",
799
+ "\n",
800
+ "# # viz_df = viz_df[(viz_df[\"delta\"]!=50.0)]\n",
801
+ "# # viz_df = viz_df[(viz_df[\"delta\"]!=50.0) & (viz_df[\"num_beams\"]!=1)]\n",
802
+ "\n",
803
+ "# print(len(viz_df))\n",
804
+ "\n",
805
+ "# viz_df"
806
+ ]
807
+ },
808
+ {
809
+ "cell_type": "markdown",
810
+ "metadata": {},
811
+ "source": [
812
+ "# Visualize the WL/BL hits via highlighting in html"
813
+ ]
814
+ },
815
+ {
816
+ "cell_type": "code",
817
+ "execution_count": null,
818
+ "metadata": {},
819
+ "outputs": [],
820
+ "source": [
821
+ "# idx = 75\n",
822
+ "# # idx = 62\n",
823
+ "\n",
824
+ "# # debug\n",
825
+ "# # idx = 7\n",
826
+ "# # idx = 18\n",
827
+ "# # idx = 231\n",
828
+ "\n",
829
+ "# print(gen_table_w_bl_stats[idx])\n",
830
+ "# print(f\"\\nPrompt:\",gen_table_w_bl_stats[idx][\"truncated_input\"])\n",
831
+ "# print(f\"\\nBaseline (real text):{gen_table_w_bl_stats[idx]['baseline_completion']}\")\n",
832
+ "# print(f\"\\nNo Blacklist:{gen_table_w_bl_stats[idx]['no_bl_output']}\")\n",
833
+ "# print(f\"\\nw/ Blacklist:{gen_table_w_bl_stats[idx]['w_bl_output']}\")"
834
+ ]
835
+ },
836
+ {
837
+ "cell_type": "code",
838
+ "execution_count": null,
839
+ "metadata": {},
840
+ "outputs": [],
841
+ "source": [
842
+ "# from ipymarkup import show_span_box_markup, get_span_box_markup\n",
843
+ "# from ipymarkup.palette import palette, RED, GREEN, BLUE\n",
844
+ "\n",
845
+ "# from IPython.display import display, HTML\n",
846
+ "\n",
847
+ "# from transformers import GPT2TokenizerFast\n",
848
+ "# # fast_tokenizer = GPT2TokenizerFast.from_pretrained(\"gpt2\")\n",
849
+ "# fast_tokenizer = GPT2TokenizerFast.from_pretrained(\"facebook/opt-2.7b\")"
850
+ ]
851
+ },
852
+ {
853
+ "cell_type": "code",
854
+ "execution_count": null,
855
+ "metadata": {},
856
+ "outputs": [],
857
+ "source": [
858
+ "# %autoreload\n",
859
+ "\n",
860
+ "# vis_bl = partial(\n",
861
+ "# compute_bl_metrics,\n",
862
+ "# tokenizer=fast_tokenizer,\n",
863
+ "# hf_model_name=gen_table_meta[\"model_name\"],\n",
864
+ "# initial_seed=gen_table_meta[\"initial_seed\"],\n",
865
+ "# dynamic_seed=gen_table_meta[\"dynamic_seed\"],\n",
866
+ "# bl_proportion=gen_table_meta[\"bl_proportion\"],\n",
867
+ "# record_hits = True,\n",
868
+ "# use_cuda=True, # this is obvi critical to match the pseudorandomness\n",
869
+ "# )"
870
+ ]
871
+ },
872
+ {
873
+ "cell_type": "code",
874
+ "execution_count": null,
875
+ "metadata": {},
876
+ "outputs": [],
877
+ "source": [
878
+ "# stats = vis_bl(gen_table_w_bl_stats[idx], 0)\n",
879
+ "\n",
880
+ "# baseline_hit_list = stats[\"baseline_hit_list\"]\n",
881
+ "# no_bl_hit_list = stats[\"no_bl_hit_list\"]\n",
882
+ "# w_bl_hit_list = stats[\"w_bl_hit_list\"]"
883
+ ]
884
+ },
885
+ {
886
+ "cell_type": "code",
887
+ "execution_count": null,
888
+ "metadata": {},
889
+ "outputs": [],
890
+ "source": [
891
+ "# text = stats[\"truncated_input\"]\n",
892
+ "# fast_encoded = fast_tokenizer(text, truncation=True, max_length=2048)\n",
893
+ "# hit_list = baseline_hit_list\n",
894
+ "\n",
895
+ "# charspans = [fast_encoded.token_to_chars(i) for i in range(len(fast_encoded[\"input_ids\"]))]\n",
896
+ "# charspans = [cs for cs in charspans if cs is not None]\n",
897
+ "# # spans = [(cs.start,cs.end, \"PR\") for i,cs in enumerate(charspans)]\n",
898
+ "# spans = []\n",
899
+ "\n",
900
+ "# html = get_span_box_markup(text, spans, palette=palette(PR=BLUE), background='white', text_color=\"black\")\n",
901
+ "\n",
902
+ "\n",
903
+ "# with open(\"figs/prompt_html.html\", \"w\") as f:\n",
904
+ "# f.write(HTML(html).data)\n",
905
+ "\n",
906
+ "# HTML(html)"
907
+ ]
908
+ },
909
+ {
910
+ "cell_type": "code",
911
+ "execution_count": null,
912
+ "metadata": {},
913
+ "outputs": [],
914
+ "source": [
915
+ "# text = stats[\"baseline_completion\"]\n",
916
+ "# fast_encoded = fast_tokenizer(text, truncation=True, max_length=2048)\n",
917
+ "# hit_list = baseline_hit_list\n",
918
+ "\n",
919
+ "# charspans = [fast_encoded.token_to_chars(i) for i in range(len(fast_encoded[\"input_ids\"]))]\n",
920
+ "# charspans = [cs for cs in charspans if cs is not None]\n",
921
+ "# spans = [(cs.start,cs.end, \"BL\") if hit_list[i] else (cs.start,cs.end, \"WL\") for i,cs in enumerate(charspans)]\n",
922
+ "\n",
923
+ "# html = get_span_box_markup(text, spans, palette=palette(BL=RED, WL=GREEN), background='white', text_color=\"black\")\n",
924
+ "\n",
925
+ "\n",
926
+ "# with open(\"figs/baseline_html.html\", \"w\") as f:\n",
927
+ "# f.write(HTML(html).data)\n",
928
+ "\n",
929
+ "# HTML(html)\n"
930
+ ]
931
+ },
932
+ {
933
+ "cell_type": "code",
934
+ "execution_count": null,
935
+ "metadata": {},
936
+ "outputs": [],
937
+ "source": [
938
+ "\n",
939
+ "# text = stats[\"no_bl_output\"]\n",
940
+ "# fast_encoded = fast_tokenizer(text, truncation=True, max_length=2048)\n",
941
+ "# hit_list = no_bl_hit_list\n",
942
+ "\n",
943
+ "# charspans = [fast_encoded.token_to_chars(i) for i in range(len(fast_encoded[\"input_ids\"]))]\n",
944
+ "# charspans = [cs for cs in charspans if cs is not None]\n",
945
+ "# spans = [(cs.start,cs.end, \"BL\") if hit_list[i] else (cs.start,cs.end, \"WL\") for i,cs in enumerate(charspans)]\n",
946
+ "\n",
947
+ "# html = get_span_box_markup(text, spans, palette=palette(BL=RED, WL=GREEN), background='white', text_color=\"black\")\n",
948
+ "\n",
949
+ "\n",
950
+ "# with open(\"figs/no_bl_html.html\", \"w\") as f:\n",
951
+ "# f.write(HTML(html).data)\n",
952
+ "\n",
953
+ "# HTML(html)\n"
954
+ ]
955
+ },
956
+ {
957
+ "cell_type": "code",
958
+ "execution_count": null,
959
+ "metadata": {},
960
+ "outputs": [],
961
+ "source": [
962
+ "\n",
963
+ "# text = stats[\"w_bl_output\"]\n",
964
+ "# fast_encoded = fast_tokenizer(text, truncation=True, max_length=2048)\n",
965
+ "# hit_list = w_bl_hit_list\n",
966
+ "\n",
967
+ "# charspans = [fast_encoded.token_to_chars(i) for i in range(len(fast_encoded[\"input_ids\"]))]\n",
968
+ "# charspans = [cs for cs in charspans if cs is not None]\n",
969
+ "# spans = [(cs.start,cs.end, \"BL\") if hit_list[i] else (cs.start,cs.end, \"WL\") for i,cs in enumerate(charspans)]\n",
970
+ "\n",
971
+ "# html = get_span_box_markup(text, spans, palette=palette(BL=RED, WL=GREEN), background='white', text_color=\"black\")\n",
972
+ "\n",
973
+ "\n",
974
+ "# with open(\"figs/w_bl_html.html\", \"w\") as f:\n",
975
+ "# f.write(HTML(html).data)\n",
976
+ "\n",
977
+ "# HTML(html)"
978
+ ]
979
+ }
980
+ ],
981
+ "metadata": {
982
+ "kernelspec": {
983
+ "display_name": "Python 3",
984
+ "language": "python",
985
+ "name": "python3"
986
+ },
987
+ "language_info": {
988
+ "codemirror_mode": {
989
+ "name": "ipython",
990
+ "version": 3
991
+ },
992
+ "file_extension": ".py",
993
+ "mimetype": "text/x-python",
994
+ "name": "python",
995
+ "nbconvert_exporter": "python",
996
+ "pygments_lexer": "ipython3",
997
+ "version": "3.10.6"
998
+ },
999
+ "vscode": {
1000
+ "interpreter": {
1001
+ "hash": "365524a309ad80022da286f2ec5d2060ce5cb229abb6076cf68d9a1ab14bd8fe"
1002
+ }
1003
+ }
1004
+ },
1005
+ "nbformat": 4,
1006
+ "nbformat_minor": 4
1007
+ }
lm-watermarking-main/extended_watermark_processor.py ADDED
@@ -0,0 +1,625 @@
1
+ # coding=utf-8
2
+ # Copyright 2023 Authors of "A Watermark for Large Language Models"
3
+ # available at https://arxiv.org/abs/2301.10226
4
+ #
5
+ # Licensed under the Apache License, Version 2.0 (the "License");
6
+ # you may not use this file except in compliance with the License.
7
+ # You may obtain a copy of the License at
8
+ #
9
+ # http://www.apache.org/licenses/LICENSE-2.0
10
+ #
11
+ # Unless required by applicable law or agreed to in writing, software
12
+ # distributed under the License is distributed on an "AS IS" BASIS,
13
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14
+ # See the License for the specific language governing permissions and
15
+ # limitations under the License.
16
+
17
+ from __future__ import annotations
18
+ import collections
19
+ from math import sqrt
20
+ from itertools import chain, tee
21
+ from functools import lru_cache
22
+
23
+ import scipy.stats
24
+ import torch
25
+ from tokenizers import Tokenizer
26
+ from transformers import LogitsProcessor
27
+
28
+ from normalizers import normalization_strategy_lookup
29
+ from alternative_prf_schemes import prf_lookup, seeding_scheme_lookup
30
+
31
+
32
+ class WatermarkBase:
33
+ def __init__(
34
+ self,
35
+ vocab: list[int] = None,
36
+ gamma: float = 0.25,
37
+ delta: float = 2.0,
38
+ seeding_scheme: str = "selfhash", # simple default, find more schemes in alternative_prf_schemes.py
39
+ select_green_tokens: bool = True, # should always be the default if not running in legacy mode
40
+ ):
41
+ # patch now that None could now maybe be passed as seeding_scheme
42
+ if seeding_scheme is None:
43
+ seeding_scheme = "selfhash"
44
+
45
+ # Vocabulary setup
46
+ self.vocab = vocab
47
+ self.vocab_size = len(vocab)
48
+
49
+ # Watermark behavior:
50
+ self.gamma = gamma
51
+ self.delta = delta
52
+ self.rng = None
53
+ self._initialize_seeding_scheme(seeding_scheme)
54
+ # Legacy behavior:
55
+ self.select_green_tokens = select_green_tokens
56
+
57
+ def _initialize_seeding_scheme(self, seeding_scheme: str) -> None:
58
+ """Initialize all internal settings of the seeding strategy from a colloquial, "public" name for the scheme."""
59
+ self.prf_type, self.context_width, self.self_salt, self.hash_key = seeding_scheme_lookup(seeding_scheme)
60
+
61
+ def _seed_rng(self, input_ids: torch.LongTensor) -> None:
62
+ """Seed RNG from local context. Not batched, because the generators we use (like cuda.random) are not batched."""
63
+ # Need to have enough context for seed generation
64
+ if input_ids.shape[-1] < self.context_width:
65
+ raise ValueError(f"seeding_scheme requires at least a {self.context_width} token prefix to seed the RNG.")
66
+
67
+ prf_key = prf_lookup[self.prf_type](input_ids[-self.context_width :], salt_key=self.hash_key)
68
+ # enable for long, interesting streams of pseudorandom numbers: print(prf_key)
69
+ self.rng.manual_seed(prf_key % (2**64 - 1)) # safeguard against overflow from long
70
+
71
+ def _get_greenlist_ids(self, input_ids: torch.LongTensor) -> torch.LongTensor:
72
+ """Seed rng based on local context width and use this information to generate ids on the green list."""
73
+ self._seed_rng(input_ids)
74
+
75
+ greenlist_size = int(self.vocab_size * self.gamma)
76
+ vocab_permutation = torch.randperm(self.vocab_size, device=input_ids.device, generator=self.rng)
77
+ if self.select_green_tokens: # directly
78
+ greenlist_ids = vocab_permutation[:greenlist_size] # new
79
+ else: # select green via red
80
+ greenlist_ids = vocab_permutation[(self.vocab_size - greenlist_size) :] # legacy behavior
81
+ return greenlist_ids
82
+
83
+
84
+ class WatermarkLogitsProcessor(WatermarkBase, LogitsProcessor):
85
+ """LogitsProcessor modifying model output scores in a pipe. Can be used in any HF pipeline to modify scores to fit the watermark,
86
+ but can also be used as a standalone tool, inserted between the output scores of any model and its next-token sampler.
87
+ """
88
+
89
+ def __init__(self, *args, store_spike_ents: bool = False, **kwargs):
90
+ super().__init__(*args, **kwargs)
91
+
92
+ self.store_spike_ents = store_spike_ents
93
+ self.spike_entropies = None
94
+ if self.store_spike_ents:
95
+ self._init_spike_entropies()
96
+
97
+ def _init_spike_entropies(self):
98
+ alpha = torch.exp(torch.tensor(self.delta)).item()
99
+ gamma = self.gamma
100
+
101
+ self.z_value = ((1 - gamma) * (alpha - 1)) / (1 - gamma + (alpha * gamma))
102
+ self.expected_gl_coef = (gamma * alpha) / (1 - gamma + (alpha * gamma))
103
+
104
+ # catch for overflow when bias is "infinite"
105
+ if alpha == torch.inf:
106
+ self.z_value = 1.0
107
+ self.expected_gl_coef = 1.0
108
+
109
+ def _get_spike_entropies(self):
110
+ spike_ents = [[] for _ in range(len(self.spike_entropies))]
111
+ for b_idx, ent_tensor_list in enumerate(self.spike_entropies):
112
+ for ent_tensor in ent_tensor_list:
113
+ spike_ents[b_idx].append(ent_tensor.item())
114
+ return spike_ents
115
+
116
+ def _get_and_clear_stored_spike_ents(self):
117
+ spike_ents = self._get_spike_entropies()
118
+ self.spike_entropies = None
119
+ return spike_ents
120
+
121
+ def _compute_spike_entropy(self, scores):
122
+ # precomputed z value in init
123
+ probs = scores.softmax(dim=-1)
124
+ denoms = 1 + (self.z_value * probs)
125
+ renormed_probs = probs / denoms
126
+ sum_renormed_probs = renormed_probs.sum()
127
+ return sum_renormed_probs
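+ # Note: the value returned above is the "spike entropy"
+ #   S = sum_k p_k / (1 + z * p_k),  with z = (1 - gamma) * (exp(delta) - 1) / (1 + gamma * (exp(delta) - 1)),
+ # computed over the softmax probabilities p_k. S is larger for flatter next-token distributions and
+ # smaller for peaked ones, and is recorded once per generation step when store_spike_ents=True.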
128
+
129
+ def _calc_greenlist_mask(self, scores: torch.FloatTensor, greenlist_token_ids) -> torch.BoolTensor:
130
+ # Cannot lose loop, greenlists might have different lengths
131
+ green_tokens_mask = torch.zeros_like(scores, dtype=torch.bool)
132
+ for b_idx, greenlist in enumerate(greenlist_token_ids):
133
+ if len(greenlist) > 0:
134
+ green_tokens_mask[b_idx][greenlist] = True
135
+ return green_tokens_mask
136
+
137
+ def _bias_greenlist_logits(self, scores: torch.Tensor, greenlist_mask: torch.Tensor, greenlist_bias: float) -> torch.Tensor:
138
+ scores[greenlist_mask] = scores[greenlist_mask] + greenlist_bias
139
+ return scores
140
+
141
+ def _score_rejection_sampling(self, input_ids: torch.LongTensor, scores: torch.FloatTensor, tail_rule="fixed_compute") -> list[int]:
142
+ """Generate greenlist based on current candidate next token. Reject and move on if necessary. Method not batched.
143
+ This is only a partial version of Alg.3 "Robust Private Watermarking", as it always assumes greedy sampling. It will still roughly
144
+ work for all types of sampling, but less effectively.
145
+ To work efficiently, this function can switch between a number of rules for handling the distribution tail.
146
+ These are not exposed by default.
147
+ """
148
+ sorted_scores, greedy_predictions = scores.sort(dim=-1, descending=True)
149
+
150
+ final_greenlist = []
151
+ for idx, prediction_candidate in enumerate(greedy_predictions):
152
+ greenlist_ids = self._get_greenlist_ids(torch.cat([input_ids, prediction_candidate[None]], dim=0)) # add candidate to prefix
153
+ if prediction_candidate in greenlist_ids: # test for consistency
154
+ final_greenlist.append(prediction_candidate)
155
+
156
+ # What follows below are optional early-stopping rules for efficiency
157
+ if tail_rule == "fixed_score":
158
+ if sorted_scores[0] - sorted_scores[idx + 1] > self.delta:
159
+ break
160
+ elif tail_rule == "fixed_list_length":
161
+ if len(final_greenlist) == 10:
162
+ break
163
+ elif tail_rule == "fixed_compute":
164
+ if idx == 40:
165
+ break
166
+ else:
167
+ pass # do not break early
168
+ return torch.as_tensor(final_greenlist, device=input_ids.device)
169
+
170
+ def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor) -> torch.FloatTensor:
171
+ """Call with previous context as input_ids, and scores for next token."""
172
+
173
+ # this is lazy to allow us to co-locate on the watermarked model's device
174
+ self.rng = torch.Generator(device=input_ids.device) if self.rng is None else self.rng
175
+
176
+ # NOTE, it would be nice to get rid of this batch loop, but currently,
177
+ # the seed and partition operations are not tensor/vectorized, thus
178
+ # each sequence in the batch needs to be treated separately.
179
+
180
+ list_of_greenlist_ids = [None for _ in input_ids] # Greenlists could differ in length
181
+ for b_idx, input_seq in enumerate(input_ids):
182
+ if self.self_salt:
183
+ greenlist_ids = self._score_rejection_sampling(input_seq, scores[b_idx])
184
+ else:
185
+ greenlist_ids = self._get_greenlist_ids(input_seq)
186
+ list_of_greenlist_ids[b_idx] = greenlist_ids
187
+
188
+ # logic for computing and storing spike entropies for analysis
189
+ if self.store_spike_ents:
190
+ if self.spike_entropies is None:
191
+ self.spike_entropies = [[] for _ in range(input_ids.shape[0])]
192
+ self.spike_entropies[b_idx].append(self._compute_spike_entropy(scores[b_idx]))
193
+
194
+ green_tokens_mask = self._calc_greenlist_mask(scores=scores, greenlist_token_ids=list_of_greenlist_ids)
195
+ scores = self._bias_greenlist_logits(scores=scores, greenlist_mask=green_tokens_mask, greenlist_bias=self.delta)
196
+
197
+ return scores
198
+
199
+
200
+ class WatermarkDetector(WatermarkBase):
201
+ """This is the detector for all watermarks imprinted with WatermarkLogitsProcessor.
202
+
203
+ The detector needs to be given the exact same settings that were given during text generation to replicate the watermark
204
+ greenlist generation and so detect the watermark.
205
+ This includes the correct device that was used during text generation, the correct tokenizer, the correct
206
+ seeding_scheme name, and parameters (delta, gamma).
207
+
208
+ Optional arguments are
209
+ * normalizers ["unicode", "homoglyphs", "truecase"] -> These can mitigate modifications to generated text that could trip the watermark
210
+ * ignore_repeated_ngrams -> This option changes the detection rules to count every unique ngram only once.
211
+ * z_threshold -> Changing this threshold will change the sensitivity of the detector.
212
+ """
213
+
214
+ def __init__(
215
+ self,
216
+ *args,
217
+ device: torch.device = None,
218
+ tokenizer: Tokenizer = None,
219
+ z_threshold: float = 4.0,
220
+ normalizers: list[str] = ["unicode"], # or also: ["unicode", "homoglyphs", "truecase"]
221
+ ignore_repeated_ngrams: bool = True,
222
+ **kwargs,
223
+ ):
224
+ super().__init__(*args, **kwargs)
225
+ # also configure the metrics returned/preprocessing options
226
+ assert device, "Must pass device"
227
+ assert tokenizer, "Need an instance of the generating tokenizer to perform detection"
228
+
229
+ self.tokenizer = tokenizer
230
+ self.device = device
231
+ self.z_threshold = z_threshold
232
+ self.rng = torch.Generator(device=self.device)
233
+
234
+ self.normalizers = []
235
+ for normalization_strategy in normalizers:
236
+ self.normalizers.append(normalization_strategy_lookup(normalization_strategy))
237
+ self.ignore_repeated_ngrams = ignore_repeated_ngrams
238
+
239
+ def dummy_detect(
240
+ self,
241
+ return_prediction: bool = True,
242
+ return_scores: bool = True,
243
+ z_threshold: float = None,
244
+ return_num_tokens_scored: bool = True,
245
+ return_num_green_tokens: bool = True,
246
+ return_green_fraction: bool = True,
247
+ return_green_token_mask: bool = False,
248
+ return_all_window_scores: bool = False,
249
+ return_z_score: bool = True,
250
+ return_z_at_T: bool = True,
251
+ return_p_value: bool = True,
252
+ ):
253
+ # HF-style output dictionary
254
+ score_dict = dict()
255
+ if return_num_tokens_scored:
256
+ score_dict.update(dict(num_tokens_scored=float("nan")))
257
+ if return_num_green_tokens:
258
+ score_dict.update(dict(num_green_tokens=float("nan")))
259
+ if return_green_fraction:
260
+ score_dict.update(dict(green_fraction=float("nan")))
261
+ if return_z_score:
262
+ score_dict.update(dict(z_score=float("nan")))
263
+ if return_p_value:
264
+ z_score = score_dict.get("z_score")
265
+ if z_score is None:
266
+ z_score = float("nan")
267
+ score_dict.update(dict(p_value=float("nan")))
268
+ if return_green_token_mask:
269
+ score_dict.update(dict(green_token_mask=[]))
270
+ if return_all_window_scores:
271
+ score_dict.update(dict(window_list=[]))
272
+ if return_z_at_T:
273
+ score_dict.update(dict(z_score_at_T=torch.tensor([])))
274
+
275
+ output_dict = {}
276
+ if return_scores:
277
+ output_dict.update(score_dict)
278
+ # if passed return_prediction then perform the hypothesis test and return the outcome
279
+ if return_prediction:
280
+ z_threshold = z_threshold if z_threshold else self.z_threshold
281
+ assert z_threshold is not None, "Need a threshold in order to decide outcome of detection test"
282
+ output_dict["prediction"] = False
283
+
284
+ return output_dict
285
+
286
+ def _compute_z_score(self, observed_count, T):
287
+ # count refers to number of green tokens, T is total number of tokens
288
+ expected_count = self.gamma
289
+ numer = observed_count - expected_count * T
290
+ denom = sqrt(T * expected_count * (1 - expected_count))
291
+ z = numer / denom
292
+ return z
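+ # Note: the statistic above is a one-proportion z-test,
+ #   z = (g - gamma * T) / sqrt(T * gamma * (1 - gamma)),
+ # where g is the observed green-token count and T the number of tokens scored, under the null
+ # hypothesis that each scored token falls in the greenlist independently with probability gamma.
+ # _compute_p_value below converts z to a one-sided p-value via the normal survival function.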
293
+
294
+ def _compute_p_value(self, z):
295
+ p_value = scipy.stats.norm.sf(z)
296
+ return p_value
297
+
298
+ @lru_cache(maxsize=2**32)
299
+ def _get_ngram_score_cached(self, prefix: tuple[int], target: int):
300
+ """Expensive re-seeding and sampling is cached."""
301
+ # Handle with care, should ideally reset on __getattribute__ access to self.prf_type, self.context_width, self.self_salt, self.hash_key
302
+ greenlist_ids = self._get_greenlist_ids(torch.as_tensor(prefix, device=self.device))
303
+ return True if target in greenlist_ids else False
304
+
305
+ def _score_ngrams_in_passage(self, input_ids: torch.Tensor):
306
+ """Core function to gather all ngrams in the input and compute their watermark."""
307
+ if len(input_ids) - self.context_width < 1:
308
+ raise ValueError(
309
+ f"Must have at least {1} token to score after "
310
+ f"the first min_prefix_len={self.context_width} tokens required by the seeding scheme."
311
+ )
312
+
313
+ # Compute scores for all ngrams contexts in the passage:
314
+ token_ngram_generator = ngrams(input_ids.cpu().tolist(), self.context_width + 1 - self.self_salt)
315
+ frequencies_table = collections.Counter(token_ngram_generator)
316
+ ngram_to_watermark_lookup = {}
317
+ for idx, ngram_example in enumerate(frequencies_table.keys()):
318
+ prefix = ngram_example if self.self_salt else ngram_example[:-1]
319
+ target = ngram_example[-1]
320
+ ngram_to_watermark_lookup[ngram_example] = self._get_ngram_score_cached(prefix, target)
321
+
322
+ return ngram_to_watermark_lookup, frequencies_table
323
+
324
+ def _get_green_at_T_booleans(self, input_ids, ngram_to_watermark_lookup) -> tuple[torch.Tensor]:
325
+ """Generate binary list of green vs. red per token, a separate list that ignores repeated ngrams, and a list of offsets to
326
+ convert between both representations:
327
+ green_token_mask = green_token_mask_unique[offsets] except for all locations where otherwise a repeat would be counted
328
+ """
329
+ green_token_mask, green_token_mask_unique, offsets = [], [], []
330
+ used_ngrams = {}
331
+ unique_ngram_idx = 0
332
+ ngram_examples = ngrams(input_ids.cpu().tolist(), self.context_width + 1 - self.self_salt)
333
+
334
+ for idx, ngram_example in enumerate(ngram_examples):
335
+ green_token_mask.append(ngram_to_watermark_lookup[ngram_example])
336
+ if self.ignore_repeated_ngrams:
337
+ if ngram_example in used_ngrams:
338
+ pass
339
+ else:
340
+ used_ngrams[ngram_example] = True
341
+ unique_ngram_idx += 1
342
+ green_token_mask_unique.append(ngram_to_watermark_lookup[ngram_example])
343
+ else:
344
+ green_token_mask_unique.append(ngram_to_watermark_lookup[ngram_example])
345
+ unique_ngram_idx += 1
346
+ offsets.append(unique_ngram_idx - 1)
347
+ return (
348
+ torch.tensor(green_token_mask),
349
+ torch.tensor(green_token_mask_unique),
350
+ torch.tensor(offsets),
351
+ )
352
+
353
+ def _score_sequence(
354
+ self,
355
+ input_ids: torch.Tensor,
356
+ return_num_tokens_scored: bool = True,
357
+ return_num_green_tokens: bool = True,
358
+ return_green_fraction: bool = True,
359
+ return_green_token_mask: bool = False,
360
+ return_z_score: bool = True,
361
+ return_z_at_T: bool = True,
362
+ return_p_value: bool = True,
363
+ ):
364
+ ngram_to_watermark_lookup, frequencies_table = self._score_ngrams_in_passage(input_ids)
365
+ green_token_mask, green_unique, offsets = self._get_green_at_T_booleans(input_ids, ngram_to_watermark_lookup)
366
+
367
+ # Count up scores over all ngrams
368
+ if self.ignore_repeated_ngrams:
369
+ # Method that only counts a green/red hit once per unique ngram.
370
+ # The new total number of tokens scored (T) becomes the number of unique ngrams.
371
+ # We iterate over all unique token ngrams in the input, computing the greenlist
372
+ # induced by the context in each, and then checking whether the last
373
+ # token falls in that greenlist.
374
+ num_tokens_scored = len(frequencies_table.keys())
375
+ green_token_count = sum(ngram_to_watermark_lookup.values())
376
+ else:
377
+ num_tokens_scored = sum(frequencies_table.values())
378
+ assert num_tokens_scored == len(input_ids) - self.context_width + self.self_salt
379
+ green_token_count = sum(freq * outcome for freq, outcome in zip(frequencies_table.values(), ngram_to_watermark_lookup.values()))
380
+ assert green_token_count == green_unique.sum()
381
+
382
+ # HF-style output dictionary
383
+ score_dict = dict()
384
+ if return_num_tokens_scored:
385
+ score_dict.update(dict(num_tokens_scored=num_tokens_scored))
386
+ if return_num_green_tokens:
387
+ score_dict.update(dict(num_green_tokens=green_token_count))
388
+ if return_green_fraction:
389
+ score_dict.update(dict(green_fraction=(green_token_count / num_tokens_scored)))
390
+ if return_z_score:
391
+ score_dict.update(dict(z_score=self._compute_z_score(green_token_count, num_tokens_scored)))
392
+ if return_p_value:
393
+ z_score = score_dict.get("z_score")
394
+ if z_score is None:
395
+ z_score = self._compute_z_score(green_token_count, num_tokens_scored)
396
+ score_dict.update(dict(p_value=self._compute_p_value(z_score)))
397
+ if return_green_token_mask:
398
+ score_dict.update(dict(green_token_mask=green_token_mask.tolist()))
399
+ if return_z_at_T:
400
+ # Score z_at_T separately:
401
+ sizes = torch.arange(1, len(green_unique) + 1)
402
+ seq_z_score_enum = torch.cumsum(green_unique, dim=0) - self.gamma * sizes
403
+ seq_z_score_denom = torch.sqrt(sizes * self.gamma * (1 - self.gamma))
404
+ z_score_at_effective_T = seq_z_score_enum / seq_z_score_denom
405
+ z_score_at_T = z_score_at_effective_T[offsets]
406
+ assert torch.isclose(z_score_at_T[-1], torch.tensor(z_score))
407
+
408
+ score_dict.update(dict(z_score_at_T=z_score_at_T))
409
+
410
+ return score_dict
411
+
412
+ def _score_windows_impl_batched(
413
+ self,
414
+ input_ids: torch.Tensor,
415
+ window_size: str,
416
+ window_stride: int = 1,
417
+ ):
418
+ # Implementation details:
419
+ # 1) --ignore_repeated_ngrams is applied globally, and windowing is then applied over the reduced binary vector
420
+ # this is only one way of doing it, another would be to ignore bigrams within each window (maybe harder to parallelize that)
421
+ # 2) These windows slide over the binary vector of green/red hits, independent of context_width, in contrast to Kezhi's first implementation
422
+ # 3) z-scores from this implementation cannot be directly converted to p-values, and should only be used as labels for a
423
+ # ROC chart that calibrates to a chosen FPR. Due to windowing, the multiple hypotheses will increase scores across the board;
424
+ # naive_count_correction=True is a partial remedy to this
425
+
426
+ ngram_to_watermark_lookup, frequencies_table = self._score_ngrams_in_passage(input_ids)
427
+ green_mask, green_ids, offsets = self._get_green_at_T_booleans(input_ids, ngram_to_watermark_lookup)
428
+ len_full_context = len(green_ids)
429
+
430
+ partial_sum_id_table = torch.cumsum(green_ids, dim=0)
431
+
432
+ if window_size == "max":
433
+ # could start later, small window sizes cannot generate enough power
434
+ # more principled: solve (T * Spike_Entropy - g * T) / sqrt(T * g * (1 - g)) = z_thresh for T
435
+ sizes = range(1, len_full_context)
436
+ else:
437
+ sizes = [int(x) for x in window_size.split(",") if len(x) > 0]
438
+
439
+ z_score_max_per_window = torch.zeros(len(sizes))
440
+ cumulative_eff_z_score = torch.zeros(len_full_context)
441
+ s = window_stride
442
+
443
+ window_fits = False
444
+ for idx, size in enumerate(sizes):
445
+ if size <= len_full_context:
446
+ # Compute hits within window for all positions in parallel:
447
+ window_score = torch.zeros(len_full_context - size + 1, dtype=torch.long)
448
+ # Include 0-th window
449
+ window_score[0] = partial_sum_id_table[size - 1]
450
+ # All other windows from the 1st:
451
+ window_score[1:] = partial_sum_id_table[size::s] - partial_sum_id_table[:-size:s]
452
+
453
+ # Now compute batched z_scores
454
+ batched_z_score_enum = window_score - self.gamma * size
455
+ z_score_denom = sqrt(size * self.gamma * (1 - self.gamma))
456
+ batched_z_score = batched_z_score_enum / z_score_denom
457
+
458
+ # And find the maximal hit
459
+ maximal_z_score = batched_z_score.max()
460
+ z_score_max_per_window[idx] = maximal_z_score
461
+
462
+ z_score_at_effective_T = torch.cummax(batched_z_score, dim=0)[0]
463
+ cumulative_eff_z_score[size::s] = torch.maximum(cumulative_eff_z_score[size::s], z_score_at_effective_T[:-1])
464
+ window_fits = True # successful computation for any window in sizes
465
+
466
+ if not window_fits:
467
+ raise ValueError(
468
+ f"Could not find a fitting window with window sizes {window_size} for (effective) context length {len_full_context}."
469
+ )
470
+
471
+ # Compute optimal window size and z-score
472
+ cumulative_z_score = cumulative_eff_z_score[offsets]
473
+ optimal_z, optimal_window_size_idx = z_score_max_per_window.max(dim=0)
474
+ optimal_window_size = sizes[optimal_window_size_idx]
475
+ return (
476
+ optimal_z,
477
+ optimal_window_size,
478
+ z_score_max_per_window,
479
+ cumulative_z_score,
480
+ green_mask,
481
+ )
482
+
483
+ def _score_sequence_window(
484
+ self,
485
+ input_ids: torch.Tensor,
486
+ return_num_tokens_scored: bool = True,
487
+ return_num_green_tokens: bool = True,
488
+ return_green_fraction: bool = True,
489
+ return_green_token_mask: bool = False,
490
+ return_z_score: bool = True,
491
+ return_z_at_T: bool = True,
492
+ return_p_value: bool = True,
493
+ window_size: str = None,
494
+ window_stride: int = 1,
495
+ ):
496
+ (
497
+ optimal_z,
498
+ optimal_window_size,
499
+ _,
500
+ z_score_at_T,
501
+ green_mask,
502
+ ) = self._score_windows_impl_batched(input_ids, window_size, window_stride)
503
+
504
+ # HF-style output dictionary
505
+ score_dict = dict()
506
+ if return_num_tokens_scored:
507
+ score_dict.update(dict(num_tokens_scored=optimal_window_size))
508
+
509
+ denom = sqrt(optimal_window_size * self.gamma * (1 - self.gamma))
510
+ green_token_count = int(optimal_z * denom + self.gamma * optimal_window_size)
511
+ green_fraction = green_token_count / optimal_window_size
512
+ if return_num_green_tokens:
513
+ score_dict.update(dict(num_green_tokens=green_token_count))
514
+ if return_green_fraction:
515
+ score_dict.update(dict(green_fraction=green_fraction))
516
+ if return_z_score:
517
+ score_dict.update(dict(z_score=optimal_z))
518
+ if return_z_at_T:
519
+ score_dict.update(dict(z_score_at_T=z_score_at_T))
520
+ if return_p_value:
521
+ z_score = score_dict.get("z_score", optimal_z)
522
+ score_dict.update(dict(p_value=self._compute_p_value(z_score)))
523
+
524
+ # Return per-token results for mask. This is still the same, just scored by windows
525
+ # todo would be to mark the actually counted tokens differently
526
+ if return_green_token_mask:
527
+ score_dict.update(dict(green_token_mask=green_mask.tolist()))
528
+
529
+ return score_dict
530
+
531
+ def detect(
532
+ self,
533
+ text: str = None,
534
+ tokenized_text: list[int] = None,
535
+ window_size: str = None,
536
+ window_stride: int = None,
537
+ return_prediction: bool = True,
538
+ return_scores: bool = True,
539
+ z_threshold: float = None,
540
+ convert_to_float: bool = False,
541
+ **kwargs,
542
+ ) -> dict:
543
+ """Scores a given string of text and returns a dictionary of results."""
544
+
545
+ assert (text is not None) ^ (tokenized_text is not None), "Must pass either the raw or tokenized string"
546
+ if return_prediction:
547
+ kwargs["return_p_value"] = True # to return the "confidence":=1-p of positive detections
548
+
549
+ # run optional normalizers on text
550
+ for normalizer in self.normalizers:
551
+ text = normalizer(text)
552
+ if len(self.normalizers) > 0:
553
+ print(f"Text after normalization:\n\n{text}\n")
554
+
555
+ if tokenized_text is None:
556
+ assert self.tokenizer is not None, (
557
+ "Watermark detection on raw string ",
558
+ "requires an instance of the tokenizer ",
559
+ "that was used at generation time.",
560
+ )
561
+ tokenized_text = self.tokenizer(text, return_tensors="pt", add_special_tokens=False)["input_ids"][0].to(self.device)
562
+ if tokenized_text[0] == self.tokenizer.bos_token_id:
563
+ tokenized_text = tokenized_text[1:]
564
+ else:
565
+ # try to remove the bos_tok at beginning if it's there
566
+ if (self.tokenizer is not None) and (tokenized_text[0] == self.tokenizer.bos_token_id):
567
+ tokenized_text = tokenized_text[1:]
568
+
569
+ # call score method
570
+ output_dict = {}
571
+
572
+ if window_size is not None:
573
+ # assert window_size <= len(tokenized_text) cannot assert for all new types
574
+ score_dict = self._score_sequence_window(
575
+ tokenized_text,
576
+ window_size=window_size,
577
+ window_stride=window_stride,
578
+ **kwargs,
579
+ )
580
+ output_dict.update(score_dict)
581
+ else:
582
+ score_dict = self._score_sequence(tokenized_text, **kwargs)
583
+ if return_scores:
584
+ output_dict.update(score_dict)
585
+ # if passed return_prediction then perform the hypothesis test and return the outcome
586
+ if return_prediction:
587
+ z_threshold = z_threshold if z_threshold else self.z_threshold
588
+ assert z_threshold is not None, "Need a threshold in order to decide outcome of detection test"
589
+ output_dict["prediction"] = score_dict["z_score"] > z_threshold
590
+ if output_dict["prediction"]:
591
+ output_dict["confidence"] = 1 - score_dict["p_value"]
592
+
593
+ # convert any numerical values to float if requested
594
+ if convert_to_float:
595
+ for key, value in output_dict.items():
596
+ if isinstance(value, int):
597
+ output_dict[key] = float(value)
598
+
599
+ return output_dict
600
+
601
+
602
+ ##########################################################################
603
+ # Ngram iteration from nltk, extracted to remove the dependency
604
+ # Natural Language Toolkit: Utility functions
605
+ #
606
+ # Copyright (C) 2001-2023 NLTK Project
607
+ # Author: Steven Bird <[email protected]>
608
+ # Eric Kafe <[email protected]> (acyclic closures)
609
+ # URL: <https://www.nltk.org/>
610
+ # For license information, see https://github.com/nltk/nltk/blob/develop/LICENSE.txt
611
+ ##########################################################################
612
+
613
+
614
+ def ngrams(sequence, n, pad_left=False, pad_right=False, pad_symbol=None):
615
+ sequence = iter(sequence)
616
+ if pad_left:
617
+ sequence = chain((pad_symbol,) * (n - 1), sequence)
618
+ if pad_right:
619
+ sequence = chain(sequence, (pad_symbol,) * (n - 1))
620
+ iterables = tee(sequence, n)
621
+
622
+ for i, sub_iterable in enumerate(iterables): # For each window,
623
+ for _ in range(i): # iterate through every order of ngrams
624
+ next(sub_iterable, None) # generate the ngrams within the window.
625
+ return zip(*iterables) # Unpack and flatten the iterables.
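For orientation, here is a minimal sketch of how the two classes defined in this file are typically wired into a Hugging Face generation loop. It is an illustrative example only: the model name, prompt, and generation settings are placeholder assumptions, while the WatermarkLogitsProcessor and WatermarkDetector arguments mirror the constructors above.

    from transformers import AutoTokenizer, AutoModelForCausalLM, LogitsProcessorList
    from extended_watermark_processor import WatermarkLogitsProcessor, WatermarkDetector

    model_name = "facebook/opt-1.3b"  # placeholder; any causal LM with a compatible tokenizer should work
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForCausalLM.from_pretrained(model_name)

    # Bias generation toward the greenlist during decoding.
    watermark_processor = WatermarkLogitsProcessor(
        vocab=list(tokenizer.get_vocab().values()),
        gamma=0.25,
        delta=2.0,
        seeding_scheme="selfhash",
    )
    inputs = tokenizer("The quick brown fox", return_tensors="pt")
    output_ids = model.generate(
        **inputs,
        max_new_tokens=200,
        logits_processor=LogitsProcessorList([watermark_processor]),
    )
    # Score only the newly generated continuation, not the prompt.
    new_tokens = output_ids[0, inputs["input_ids"].shape[-1]:]
    output_text = tokenizer.decode(new_tokens, skip_special_tokens=True)

    # Detection must reuse the same settings as generation (see the detector docstring above).
    detector = WatermarkDetector(
        vocab=list(tokenizer.get_vocab().values()),
        gamma=0.25,
        seeding_scheme="selfhash",
        device=model.device,
        tokenizer=tokenizer,
        z_threshold=4.0,
    )
    print(detector.detect(output_text))  # dict with z_score, p_value, prediction, ...

A larger delta strengthens the watermark at the cost of text quality, and gamma sets the greenlist fraction; per the detector docstring above, the same parameters, tokenizer, and device must be supplied at detection time.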
lm-watermarking-main/homoglyph_data/__init__.py ADDED
@@ -0,0 +1,40 @@
1
+ # This is data for homoglyph finding
2
+
3
+ """Original package info:
4
+
5
+ Homoglyphs
6
+ * Get similar letters
7
+ * Convert string to ASCII letters
8
+ * Detect possible letter languages
9
+ * Detect letter UTF-8 group.
10
+
11
+ # main package info
12
+ __title__ = 'Homoglyphs'
13
+ __version__ = '2.0.4'
14
+ __author__ = 'Gram Orsinium'
15
+ __license__ = 'MIT'
16
+
17
+ # License:
18
+
19
+ MIT License 2019 orsinium <[email protected]>
20
+
21
+ Permission is hereby granted, free of charge, to any person obtaining a copy
22
+ of this software and associated documentation files (the "Software"), to deal
23
+ in the Software without restriction, including without limitation the rights
24
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
25
+ copies of the Software, and to permit persons to whom the Software is
26
+ furnished to do so, subject to the following conditions:
27
+
28
+ The above copyright notice and this permission notice (including the next
29
+ paragraph) shall be included in all copies or substantial portions of the
30
+ Software.
31
+
32
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
33
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
34
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
35
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
36
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
37
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
38
+ SOFTWARE.
39
+
40
+ """
lm-watermarking-main/homoglyph_data/categories.json ADDED
The diff for this file is too large to render. See raw diff
 
lm-watermarking-main/homoglyph_data/confusables_sept2022.json ADDED
The diff for this file is too large to render. See raw diff
 
lm-watermarking-main/homoglyph_data/languages.json ADDED
@@ -0,0 +1,34 @@
1
+ {
2
+ "ar": "ءآأؤإئابةتثجحخدذرزسشصضطظعغػؼؽؾؿـفقكلمنهوىيًٌٍَُِّ",
3
+ "be": "ʼЁІЎАБВГДЕЖЗЙКЛМНОПРСТУФХЦЧШЫЬЭЮЯабвгдежзйклмнопрстуфхцчшыьэюяёіў",
4
+ "bg": "АБВГДЕЖЗИЙКЛМНОПРСТУФХЦЧШЩЪЬЮЯабвгдежзийклмнопрстуфхцчшщъьюя",
5
+ "ca": "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyzÀÈÉÍÏÒÓÚÜÇàèéíïòóúüç·",
6
+ "cz": "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyzÁÉÍÓÚÝáéíóúýČčĎďĚěŇňŘřŠšŤťŮůŽž",
7
+ "da": "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyzÅÆØåæø",
8
+ "de": "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyzÄÖÜßäöü",
9
+ "el": "ΪΫΆΈΉΊΌΎΏΑΒΓΔΕΖΗΘΙΚΛΜΝΞΟΠΡΣΤΥΦΧΨΩΐΰϊϋάέήίαβγδεζηθικλμνξοπρςστυφχψωόύώ",
10
+ "en": "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz",
11
+ "eo": "ABCDEFGHIJKLMNOPRSTUVZabcdefghijklmnoprstuvzĈĉĜĝĤĥĴĵŜŝŬŭ",
12
+ "es": "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyzÁÉÍÑÓÚÜáéíñóúü",
13
+ "et": "ABDEGHIJKLMNOPRSTUVabdeghijklmnoprstuvÄÕÖÜäõöü",
14
+ "fi": "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyzÄÅÖäåöŠšŽž",
15
+ "fr": "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyzÀÂÇÈÉÊÎÏÙÛàâçèéêîïùûŒœ",
16
+ "he": "אבגדהוזחטיךכלםמןנסעףפץצקרשתװױײ",
17
+ "hr": "ABCDEFGHIJKLMNOPRSTUVZabcdefghijklmnoprstuvzĆćČčĐ𩹮ž",
18
+ "hu": "ABCDEFGHIJKLMNOPRSTUVZabcdefghijklmnoprstuvzÁÉÍÓÖÚÜáéíóöúüŐőŰű",
19
+ "it": "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyzÀÈÉÌÒÓÙàèéìòóù",
20
+ "lt": "ABCDEFGHIJKLMNOPRSTUVYZabcdefghijklmnoprstuvyzĄąČčĖėĘęĮįŠšŪūŲųŽž",
21
+ "lv": "ABCDEFGHIJKLMNOPRSTUVZabcdefghijklmnoprstuvzĀāČčĒēĢģĪīĶķĻļŅņŠšŪūŽž",
22
+ "mk": "ЃЅЈЉЊЌЏАБВГДЕЖЗИКЛМНОПРСТУФХЦЧШабвгдежзиклмнопрстуфхцчшѓѕјљњќџ",
23
+ "nl": "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz",
24
+ "pl": "ABCDEFGHIJKLMNOPRSTUWYZabcdefghijklmnoprstuwyzÓóĄąĆćĘꣳŃńŚśŹźŻż",
25
+ "pt": "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyzÀÁÂÃÇÉÊÍÓÔÕÚàáâãçéêíóôõú",
26
+ "ro": "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyzÂÎâîĂăȘșȚț",
27
+ "ru": "ЁАБВГДЕЖЗИЙКЛМНОПРСТУФХЦЧШЩЪЫЬЭЮЯабвгдежзийклмнопрстуфхцчшщъыьэюяё",
28
+ "sk": "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyzÁÄÉÍÓÔÚÝáäéíóôúýČčĎďĹ弾ŇňŔ੹ŤťŽž",
29
+ "sl": "ABCDEFGHIJKLMNOPRSTUVZabcdefghijklmnoprstuvzČ芚Žž",
30
+ "sr": "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyzЂЈЉЊЋЏАБВГДЕЖЗИКЛМНОПРСТУФХЦЧШабвгдежзиклмнопрстуфхцчшђјљњћџ",
31
+ "th": "กขฃคฅฆงจฉชซฌญฎฏฐฑฒณดตถทธนบปผฝพฟภมยรฤลฦวศษสหฬอฮฯะัาำิีึืฺุู฿เแโใไๅๆ็่้๊๋์ํ๎๏๐๑๒๓๔๕๖๗๘๙๚๛",
32
+ "tr": "ABCDEFGHIJKLMNOPRSTUVYZabcdefghijklmnoprstuvyzÂÇÎÖÛÜâçîöûüĞğİıŞş",
33
+ "vi": "ABCDEGHIKLMNOPQRSTUVXYabcdeghiklmnopqrstuvxyÂÊÔâêôĂăĐđƠơƯư"
34
+ }
lm-watermarking-main/homoglyphs.py ADDED
@@ -0,0 +1,265 @@
1
+ """Updated version of core.py from
2
+ https://github.com/yamatt/homoglyphs/tree/main/homoglyphs_fork
3
+ for modern python3
4
+ """
5
+
6
+ from collections import defaultdict
7
+ import json
8
+ from itertools import product
9
+ import os
10
+ import unicodedata
11
+
12
+ # Actions if char not in alphabet
13
+ STRATEGY_LOAD = 1 # load category for this char
14
+ STRATEGY_IGNORE = 2 # add char to result
15
+ STRATEGY_REMOVE = 3 # remove char from result
16
+
17
+ ASCII_RANGE = range(128)
18
+
19
+
20
+ CURRENT_DIR = os.path.dirname(os.path.abspath(__file__))
21
+ DATA_LOCATION = os.path.join(CURRENT_DIR, "homoglyph_data")
22
+
23
+
24
+ class Categories:
25
+ """
26
+ Work with aliases from ISO 15924.
27
+ https://en.wikipedia.org/wiki/ISO_15924#List_of_codes
28
+ """
29
+
30
+ fpath = os.path.join(DATA_LOCATION, "categories.json")
31
+
32
+ @classmethod
33
+ def _get_ranges(cls, categories):
34
+ """
35
+ :return: iter: (start code, end code)
36
+ :rtype: list
37
+ """
38
+ with open(cls.fpath, encoding="utf-8") as f:
39
+ data = json.load(f)
40
+
41
+ for category in categories:
42
+ if category not in data["aliases"]:
43
+ raise ValueError("Invalid category: {}".format(category))
44
+
45
+ for point in data["points"]:
46
+ if point[2] in categories:
47
+ yield point[:2]
48
+
49
+ @classmethod
50
+ def get_alphabet(cls, categories):
51
+ """
52
+ :return: set of chars in alphabet by categories list
53
+ :rtype: set
54
+ """
55
+ alphabet = set()
56
+ for start, end in cls._get_ranges(categories):
57
+ chars = (chr(code) for code in range(start, end + 1))
58
+ alphabet.update(chars)
59
+ return alphabet
60
+
61
+ @classmethod
62
+ def detect(cls, char):
63
+ """
64
+ :return: category
65
+ :rtype: str
66
+ """
67
+ with open(cls.fpath, encoding="utf-8") as f:
68
+ data = json.load(f)
69
+
70
+ # try detect category by unicodedata
71
+ try:
72
+ category = unicodedata.name(char).split()[0]
73
+ except (TypeError, ValueError):
74
+ # In Python 2, unicodedata.name raises an error for non-unicode chars
75
+ # Python 3 raises ValueError for non-unicode characters
76
+ pass
77
+ else:
78
+ if category in data["aliases"]:
79
+ return category
80
+
81
+ # try detect category by ranges from JSON file.
82
+ code = ord(char)
83
+ for point in data["points"]:
84
+ if point[0] <= code <= point[1]:
85
+ return point[2]
86
+
87
+ @classmethod
88
+ def get_all(cls):
89
+ with open(cls.fpath, encoding="utf-8") as f:
90
+ data = json.load(f)
91
+ return set(data["aliases"])
92
+
93
+
94
+ class Languages:
95
+ fpath = os.path.join(DATA_LOCATION, "languages.json")
96
+
97
+ @classmethod
98
+ def get_alphabet(cls, languages):
99
+ """
100
+ :return: set of chars in alphabet by languages list
101
+ :rtype: set
102
+ """
103
+ with open(cls.fpath, encoding="utf-8") as f:
104
+ data = json.load(f)
105
+ alphabet = set()
106
+ for lang in languages:
107
+ if lang not in data:
108
+ raise ValueError("Invalid language code: {}".format(lang))
109
+ alphabet.update(data[lang])
110
+ return alphabet
111
+
112
+ @classmethod
113
+ def detect(cls, char):
114
+ """
115
+ :return: set of languages which alphabet contains passed char.
116
+ :rtype: set
117
+ """
118
+ with open(cls.fpath, encoding="utf-8") as f:
119
+ data = json.load(f)
120
+ languages = set()
121
+ for lang, alphabet in data.items():
122
+ if char in alphabet:
123
+ languages.add(lang)
124
+ return languages
125
+
126
+ @classmethod
127
+ def get_all(cls):
128
+ with open(cls.fpath, encoding="utf-8") as f:
129
+ data = json.load(f)
130
+ return set(data.keys())
131
+
132
+
133
+ class Homoglyphs:
134
+ def __init__(
135
+ self,
136
+ categories=None,
137
+ languages=None,
138
+ alphabet=None,
139
+ strategy=STRATEGY_IGNORE,
140
+ ascii_strategy=STRATEGY_IGNORE,
141
+ ascii_range=ASCII_RANGE,
142
+ ):
143
+ # strategies
144
+ if strategy not in (STRATEGY_LOAD, STRATEGY_IGNORE, STRATEGY_REMOVE):
145
+ raise ValueError("Invalid strategy")
146
+ self.strategy = strategy
147
+ self.ascii_strategy = ascii_strategy
148
+ self.ascii_range = ascii_range
149
+
150
+ # Homoglyphs must be initialized by any alphabet for correct work
151
+ if not categories and not languages and not alphabet:
152
+ categories = ("LATIN", "COMMON")
153
+
154
+ # cats and langs
155
+ self.categories = set(categories or [])
156
+ self.languages = set(languages or [])
157
+
158
+ # alphabet
159
+ self.alphabet = set(alphabet or [])
160
+ if self.categories:
161
+ alphabet = Categories.get_alphabet(self.categories)
162
+ self.alphabet.update(alphabet)
163
+ if self.languages:
164
+ alphabet = Languages.get_alphabet(self.languages)
165
+ self.alphabet.update(alphabet)
166
+ self.table = self.get_table(self.alphabet)
167
+
168
+ @staticmethod
169
+ def get_table(alphabet):
170
+ table = defaultdict(set)
171
+ with open(os.path.join(DATA_LOCATION, "confusables_sept2022.json")) as f:
172
+ data = json.load(f)
173
+ for char in alphabet:
174
+ if char in data:
175
+ for homoglyph in data[char]:
176
+ if homoglyph in alphabet:
177
+ table[char].add(homoglyph)
178
+ return table
179
+
180
+ @staticmethod
181
+ def get_restricted_table(source_alphabet, target_alphabet):
182
+ table = defaultdict(set)
183
+ with open(os.path.join(DATA_LOCATION, "confusables_sept2022.json")) as f:
184
+ data = json.load(f)
185
+ for char in source_alphabet:
186
+ if char in data:
187
+ for homoglyph in data[char]:
188
+ if homoglyph in target_alphabet:
189
+ table[char].add(homoglyph)
190
+ return table
191
+
192
+ @staticmethod
193
+ def uniq_and_sort(data):
194
+ result = list(set(data))
195
+ result.sort(key=lambda x: (-len(x), x))
196
+ return result
197
+
198
+ def _update_alphabet(self, char):
199
+ # try detect languages
200
+ langs = Languages.detect(char)
201
+ if langs:
202
+ self.languages.update(langs)
203
+ alphabet = Languages.get_alphabet(langs)
204
+ self.alphabet.update(alphabet)
205
+ else:
206
+ # try detect categories
207
+ category = Categories.detect(char)
208
+ if category is None:
209
+ return False
210
+ self.categories.add(category)
211
+ alphabet = Categories.get_alphabet([category])
212
+ self.alphabet.update(alphabet)
213
+ # update table for new alphabet
214
+ self.table = self.get_table(self.alphabet)
215
+ return True
216
+
217
+ def _get_char_variants(self, char):
218
+ if char not in self.alphabet:
219
+ if self.strategy == STRATEGY_LOAD:
220
+ if not self._update_alphabet(char):
221
+ return []
222
+ elif self.strategy == STRATEGY_IGNORE:
223
+ return [char]
224
+ elif self.strategy == STRATEGY_REMOVE:
225
+ return []
226
+
227
+ # find alternative chars for current char
228
+ alt_chars = self.table.get(char, set())
229
+ if alt_chars:
230
+ # find alternative chars for alternative chars for current char
231
+ alt_chars2 = [self.table.get(alt_char, set()) for alt_char in alt_chars]
232
+ # combine all alternatives
233
+ alt_chars.update(*alt_chars2)
234
+ # add current char to alternatives
235
+ alt_chars.add(char)
236
+
237
+ # uniq, sort and return
238
+ return self.uniq_and_sort(alt_chars)
239
+
240
+ def _get_combinations(self, text, ascii=False):
241
+ variations = []
242
+ for char in text:
243
+ alt_chars = self._get_char_variants(char)
244
+
245
+ if ascii:
246
+ alt_chars = [char for char in alt_chars if ord(char) in self.ascii_range]
247
+ if not alt_chars and self.ascii_strategy == STRATEGY_IGNORE:
248
+ return
249
+
250
+ if alt_chars:
251
+ variations.append(alt_chars)
252
+ if variations:
253
+ for variant in product(*variations):
254
+ yield "".join(variant)
255
+
256
+ def get_combinations(self, text):
257
+ return list(self._get_combinations(text))
258
+
259
+ def _to_ascii(self, text):
260
+ for variant in self._get_combinations(text, ascii=True):
261
+ if max(map(ord, variant)) in self.ascii_range:
262
+ yield variant
263
+
264
+ def to_ascii(self, text):
265
+ return self.uniq_and_sort(self._to_ascii(text))
lm-watermarking-main/normalizers.py ADDED
@@ -0,0 +1,208 @@
1
+ """ Text-based normalizers, used to mitigate simple attacks against watermarking.
2
+
3
+ This implementation is unlikely to be a complete list of all possible exploits within the unicode standard;
4
+ it represents our best effort at the time of writing.
5
+
6
+ These normalizers can be used as stand-alone normalizers. They could be made to conform to the HF tokenizers standard, but that would
7
+ require messing with the limited rust interface of tokenizers.NormalizedString
8
+ """
9
+ from collections import defaultdict
10
+ from functools import cache
11
+
12
+ import re
13
+ import unicodedata
14
+ import homoglyphs as hg
15
+
16
+
17
+ def normalization_strategy_lookup(strategy_name: str) -> object:
18
+ if strategy_name == "unicode":
19
+ return UnicodeSanitizer()
20
+ elif strategy_name == "homoglyphs":
21
+ return HomoglyphCanonizer()
22
+ elif strategy_name == "truecase":
23
+ return TrueCaser()
24
+
25
+
26
+ class HomoglyphCanonizer:
27
+ """Attempts to detect homoglyph attacks and find a consistent canon.
28
+
29
+ This function does so on a per-ISO-category level. Language-level would also be possible (see commented code).
30
+ """
31
+
32
+ def __init__(self):
33
+ self.homoglyphs = None
34
+
35
+ def __call__(self, homoglyphed_str: str) -> str:
36
+ # find canon:
37
+ target_category, all_categories = self._categorize_text(homoglyphed_str)
38
+ homoglyph_table = self._select_canon_category_and_load(target_category, all_categories)
39
+ return self._sanitize_text(target_category, homoglyph_table, homoglyphed_str)
40
+
41
+ def _categorize_text(self, text: str) -> dict:
42
+ iso_categories = defaultdict(int)
43
+ # self.iso_languages = defaultdict(int)
44
+
45
+ for char in text:
46
+ iso_categories[hg.Categories.detect(char)] += 1
47
+ # for lang in hg.Languages.detect(char):
48
+ # self.iso_languages[lang] += 1
49
+ target_category = max(iso_categories, key=iso_categories.get)
50
+ all_categories = tuple(iso_categories)
51
+ return target_category, all_categories
52
+
53
+ @cache
54
+ def _select_canon_category_and_load(
55
+ self, target_category: str, all_categories: tuple[str]
56
+ ) -> dict:
57
+ homoglyph_table = hg.Homoglyphs(
58
+ categories=(target_category, "COMMON")
59
+ ) # alphabet loaded here from file
60
+
61
+ source_alphabet = hg.Categories.get_alphabet(all_categories)
62
+ restricted_table = homoglyph_table.get_restricted_table(
63
+ source_alphabet, homoglyph_table.alphabet
64
+ ) # table loaded here from file
65
+ return restricted_table
66
+
67
+ def _sanitize_text(
68
+ self, target_category: str, homoglyph_table: dict, homoglyphed_str: str
69
+ ) -> str:
70
+ sanitized_text = ""
71
+ for char in homoglyphed_str:
72
+ # langs = hg.Languages.detect(char)
73
+ cat = hg.Categories.detect(char)
74
+ if target_category in cat or "COMMON" in cat or len(cat) == 0:
75
+ sanitized_text += char
76
+ else:
77
+ sanitized_text += list(homoglyph_table[char])[0]
78
+ return sanitized_text
79
+
80
+
81
+ class UnicodeSanitizer:
82
+ """Regex-based unicode sanitizer. Has different levels of granularity.
83
+
84
+ * ruleset="whitespaces" - attempts to remove only whitespace unicode characters
85
+ * ruleset="IDN.blacklist" - does its best to remove unusual unicode based on Network.IDN.blacklist characters
86
+ * ruleset="ascii" - brute-forces all text into ascii
87
+
88
+ This is unlikely to be a comprehensive list.
89
+
90
+ You can find a more comprehensive discussion at https://www.unicode.org/reports/tr36/
91
+ and https://www.unicode.org/faq/security.html
92
+ """
93
+
94
+ def __init__(self, ruleset="whitespaces"):
95
+ if ruleset == "whitespaces":
96
+ """Documentation:
97
+ \u00A0: Non-breaking space
98
+ \u1680: Ogham space mark
99
+ \u180E: Mongolian vowel separator
100
+ \u2000-\u200B: Various space characters, including en space, em space, thin space, hair space, zero-width space, and zero-width non-joiner
101
+ \u200C\u200D: Zero-width non-joiner and zero-width joiner
102
+ \u200E,\u200F: Left-to-right-mark, Right-to-left-mark
103
+ \u2060: Word joiner
104
+ \u2063: Invisible separator
105
+ \u202F: Narrow non-breaking space
106
+ \u205F: Medium mathematical space
107
+ \u3000: Ideographic space
108
+ \uFEFF: Zero-width non-breaking space
109
+ \uFFA0: Halfwidth hangul filler
110
+ \uFFF9\uFFFA\uFFFB: Interlinear annotation characters
111
+ \uFE00-\uFE0F: Variation selectors
112
+ \u202A-\u202F: Embedding characters
113
+ \u3164: Korean hangul filler.
114
+
115
+ Note that these characters are not always superfluous whitespace characters!
116
+ """
117
+
118
+ self.pattern = re.compile(
119
+ r"[\u00A0\u1680\u180E\u2000-\u200B\u200C\u200D\u200E\u200F\u2060\u2063\u202F\u205F\u3000\uFEFF\uFFA0\uFFF9\uFFFA\uFFFB"
120
+ r"\uFE00\uFE01\uFE02\uFE03\uFE04\uFE05\uFE06\uFE07\uFE08\uFE09\uFE0A\uFE0B\uFE0C\uFE0D\uFE0E\uFE0F\u3164\u202A\u202B\u202C\u202D"
121
+ r"\u202E\u202F]"
122
+ )
123
+ elif ruleset == "IDN.blacklist":
124
+ """Documentation:
125
+ [\u00A0\u1680\u180E\u2000-\u200B\u202F\u205F\u2060\u2063\uFEFF]: Matches any whitespace characters in the Unicode character
126
+ set that are included in the IDN blacklist.
127
+ \uFFF9-\uFFFB: Matches characters that are not defined in Unicode but are used as language tags in various legacy encodings.
128
+ These characters are not allowed in domain names.
129
+ \uD800-\uDB7F: Matches the first part of a surrogate pair. Surrogate pairs are used to represent characters in the Unicode character
130
+ set that cannot be represented by a single 16-bit value. The first part of a surrogate pair is in the range U+D800 to U+DBFF,
131
+ and the second part is in the range U+DC00 to U+DFFF.
132
+ \uDB80-\uDBFF][\uDC00-\uDFFF]?: Matches the second part of a surrogate pair. The second part of a surrogate pair is in the range U+DC00
133
+ to U+DFFF, and is optional.
134
+ [\uDB40\uDC20-\uDB40\uDC7F][\uDC00-\uDFFF]: Matches certain invalid UTF-16 sequences which should not appear in IDNs.
135
+ """
136
+
137
+ self.pattern = re.compile(
138
+ r"[\u00A0\u1680\u180E\u2000-\u200B\u202F\u205F\u2060\u2063\uFEFF\uFFF9-\uFFFB\uD800-\uDB7F\uDB80-\uDBFF]"
139
+ r"[\uDC00-\uDFFF]?|[\uDB40\uDC20-\uDB40\uDC7F][\uDC00-\uDFFF]"
140
+ )
141
+ else:
142
+ """Documentation:
143
+ This is a simple restriction to "no-unicode", using only ascii characters. Control characters are included.
144
+ """
145
+ self.pattern = re.compile(r"[^\x00-\x7F]+")
146
+
147
+ def __call__(self, text: str) -> str:
148
+ text = unicodedata.normalize("NFC", text) # canon forms
149
+ text = self.pattern.sub(" ", text) # pattern match
150
+ text = re.sub(" +", " ", text) # collapse whitespaces
151
+ text = "".join(
152
+ c for c in text if unicodedata.category(c) != "Cc"
153
+ ) # Remove any remaining non-printable characters
154
+ return text
155
+
156
+
157
+ class TrueCaser:
158
+ """True-casing is a capitalization normalization that returns text to its original capitalization.
159
+
160
+ This defends against attacks that wRIte TeXt lIkE spOngBoB.
161
+
162
+ Here, a simple POS-tagger is used.
163
+ """
164
+
165
+ uppercase_pos = ["PROPN"] # Name POS tags that should be upper-cased
166
+
167
+ def __init__(self, backend="spacy"):
168
+ if backend == "spacy":
169
+ import spacy
170
+
171
+ self.nlp = spacy.load("en_core_web_sm")
172
+ self.normalize_fn = self._spacy_truecasing
173
+ else:
174
+ from nltk import pos_tag, word_tokenize # noqa
175
+ import nltk
176
+
177
+ nltk.download("punkt")
178
+ nltk.download("averaged_perceptron_tagger")
179
+ nltk.download("universal_tagset")
180
+ self.normalize_fn = self._nltk_truecasing
181
+
182
+ def __call__(self, random_capitalized_string: str) -> str:
183
+ truecased_str = self.normalize_fn(random_capitalized_string)
184
+ return truecased_str
185
+
186
+ def _spacy_truecasing(self, random_capitalized_string: str):
187
+ doc = self.nlp(random_capitalized_string.lower())
188
+ POS = self.uppercase_pos
189
+ truecased_str = "".join(
190
+ [
191
+ w.text_with_ws.capitalize() if w.pos_ in POS or w.is_sent_start else w.text_with_ws
192
+ for w in doc
193
+ ]
194
+ )
195
+ return truecased_str
196
+
197
+ def _nltk_truecasing(self, random_capitalized_string: str):
198
+ from nltk import pos_tag, word_tokenize
199
+ import nltk
200
+
201
+ nltk.download("punkt")
202
+ nltk.download("averaged_perceptron_tagger")
203
+ nltk.download("universal_tagset")
204
+ POS = ["NNP", "NNPS"]
205
+
206
+ tagged_text = pos_tag(word_tokenize(random_capitalized_string.lower()))
207
+ truecased_str = " ".join([w.capitalize() if p in POS else w for (w, p) in tagged_text])
208
+ return truecased_str
lm-watermarking-main/pyproject.toml ADDED
@@ -0,0 +1,6 @@
1
+ [build-system]
2
+ requires = ["setuptools"]
3
+ build-backend = "setuptools.build_meta"
4
+
5
+ [tool.black]
6
+ line-length = 140
lm-watermarking-main/requirements.txt ADDED
@@ -0,0 +1,6 @@
1
+ gradio
2
+ nltk
3
+ scipy
4
+ torch
5
+ transformers
6
+ tokenizers
lm-watermarking-main/setup.cfg ADDED
@@ -0,0 +1,68 @@
1
+ [metadata]
2
+ name = lm-watermarking
3
+ version = 0.1.0
4
+ author = Authors of 'A Watermark for Large Language Models'
5
+ author_email = [email protected]
6
+ url = https://github.com/jwkirchenbauer/lm-watermarking
7
+ description = Implementation of watermark algorithms for large language models.
8
+ long_description = file: README.md, LICENSE.md
9
+ long_description_content_type = text/markdown
10
+ license = Apache 2.0
11
+ license_file = LICENSE.md
12
+ platform = any
13
+ keywords = Machine Learning, NLP, Language Models, Watermark, Safety, Model Output Detection
14
+ classifiers =
15
+ Topic :: Security
16
+ License :: OSI Approved :: Apache 2.0
17
+ Operating System :: OS Independent
18
+ Programming Language :: Python
19
+ homepage = https://github.com/jwkirchenbauer/lm-watermarking
20
+ repository = https://github.com/jwkirchenbauer/lm-watermarking
21
+ documentation = https://arxiv.org/abs/2301.10226
22
+
23
+ [options]
24
+ zip_safe = False
25
+ include_package_data = True
26
+ python_requires = >= 3.9
27
+ packages = find:
28
+
29
+ setup_requires =
30
+ setuptools
31
+
32
+ install_requires =
33
+ nltk
34
+ scipy
35
+ torch
36
+ transformers
37
+ tokenizers
38
+
39
+ [tool.black]
40
+ line-length = 140
41
+
42
+ [check-manifest]
43
+ ignore =
44
+ .ipynb
45
+ .sh
46
+
47
+ #inspired by https://github.com/pytorch/pytorch/blob/master/.flake8
48
+ [flake8]
49
+ select = B,C,E,F,P,T4,W,B9
50
+ max-line-length = 140
51
+ extend-ignore = E203
52
+
53
+ ignore =
54
+ E203,E305,E402,E501,E721,E741,F821,F841,F999,W503,W504,C408,E302,W291,E303,
55
+ # shebang has extra meaning in fbcode lints, so I think it's not worth trying
56
+ # to line this up with executable bit
57
+ EXE001,
58
+ # these ignores are from flake8-bugbear; please fix!
59
+ B007,B008,
60
+ # these ignores are from flake8-comprehensions; please fix!
61
+ C400,C401,C402,C403,C404,C405,C407,C411,C413,C414,C415
62
+ #unignored: F403,F405,
63
+ D102,D103,D403 # for doc linting
64
+
65
+ exclude =
66
+ .git
67
+ __pycache__
68
+ log/*
lm-watermarking-main/watermark_processor.py ADDED
@@ -0,0 +1,282 @@
1
+ # coding=utf-8
2
+ # Copyright 2023 Authors of "A Watermark for Large Language Models"
3
+ # available at https://arxiv.org/abs/2301.10226
4
+ #
5
+ # Licensed under the Apache License, Version 2.0 (the "License");
6
+ # you may not use this file except in compliance with the License.
7
+ # You may obtain a copy of the License at
8
+ #
9
+ # http://www.apache.org/licenses/LICENSE-2.0
10
+ #
11
+ # Unless required by applicable law or agreed to in writing, software
12
+ # distributed under the License is distributed on an "AS IS" BASIS,
13
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14
+ # See the License for the specific language governing permissions and
15
+ # limitations under the License.
16
+
17
+ from __future__ import annotations
18
+ import collections
19
+ from math import sqrt
20
+
21
+ import scipy.stats
22
+
23
+ import torch
24
+ from torch import Tensor
25
+ from tokenizers import Tokenizer
26
+ from transformers import LogitsProcessor
27
+
28
+ from nltk.util import ngrams
29
+
30
+ from normalizers import normalization_strategy_lookup
31
+
32
+
33
+ class WatermarkBase:
34
+ def __init__(
35
+ self,
36
+ vocab: list[int] = None,
37
+ gamma: float = 0.5,
38
+ delta: float = 2.0,
39
+ seeding_scheme: str = "simple_1", # mostly unused/always default
40
+ hash_key: int = 15485863, # just a large prime number to create a rng seed with sufficient bit width
41
+ select_green_tokens: bool = True,
42
+ ):
43
+
44
+ # watermarking parameters
45
+ self.vocab = vocab
46
+ self.vocab_size = len(vocab)
47
+ self.gamma = gamma
48
+ self.delta = delta
49
+ self.seeding_scheme = seeding_scheme
50
+ self.rng = None
51
+ self.hash_key = hash_key
52
+ self.select_green_tokens = select_green_tokens
53
+
54
+ def _seed_rng(self, input_ids: torch.LongTensor, seeding_scheme: str = None) -> None:
55
+ # can optionally override the seeding scheme,
56
+ # but uses the instance attr by default
57
+ if seeding_scheme is None:
58
+ seeding_scheme = self.seeding_scheme
59
+
60
+ if seeding_scheme == "simple_1":
61
+ assert input_ids.shape[-1] >= 1, f"seeding_scheme={seeding_scheme} requires at least a 1 token prefix sequence to seed rng"
62
+ prev_token = input_ids[-1].item()
63
+ self.rng.manual_seed(self.hash_key * prev_token)
64
+ else:
65
+ raise NotImplementedError(f"Unexpected seeding_scheme: {seeding_scheme}")
66
+ return
67
+
68
+ def _get_greenlist_ids(self, input_ids: torch.LongTensor) -> list[int]:
69
+ # seed the rng using the previous tokens/prefix
70
+ # according to the seeding_scheme
71
+ self._seed_rng(input_ids)
72
+
73
+ greenlist_size = int(self.vocab_size * self.gamma)
74
+ vocab_permutation = torch.randperm(self.vocab_size, device=input_ids.device, generator=self.rng)
75
+ if self.select_green_tokens: # directly
76
+ greenlist_ids = vocab_permutation[:greenlist_size] # new
77
+ else: # select green via red
78
+ greenlist_ids = vocab_permutation[(self.vocab_size - greenlist_size) :] # legacy behavior
79
+ return greenlist_ids
80
+
81
+
82
+ class WatermarkLogitsProcessor(WatermarkBase, LogitsProcessor):
83
+ def __init__(self, *args, **kwargs):
84
+ super().__init__(*args, **kwargs)
85
+
86
+ def _calc_greenlist_mask(self, scores: torch.FloatTensor, greenlist_token_ids) -> torch.BoolTensor:
87
+ # TODO lets see if we can lose this loop
88
+ green_tokens_mask = torch.zeros_like(scores)
89
+ for b_idx in range(len(greenlist_token_ids)):
90
+ green_tokens_mask[b_idx][greenlist_token_ids[b_idx]] = 1
91
+ final_mask = green_tokens_mask.bool()
92
+ return final_mask
93
+
94
+ def _bias_greenlist_logits(self, scores: torch.Tensor, greenlist_mask: torch.Tensor, greenlist_bias: float) -> torch.Tensor:
95
+ scores[greenlist_mask] = scores[greenlist_mask] + greenlist_bias
96
+ return scores
97
+
98
+ def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor) -> torch.FloatTensor:
99
+
100
+ # this is lazy to allow us to colocate on the watermarked model's device
101
+ if self.rng is None:
102
+ self.rng = torch.Generator(device=input_ids.device)
103
+
104
+ # NOTE, it would be nice to get rid of this batch loop, but currently,
105
+ # the seed and partition operations are not tensor/vectorized, thus
106
+ # each sequence in the batch needs to be treated separately.
107
+ batched_greenlist_ids = [None for _ in range(input_ids.shape[0])]
108
+
109
+ for b_idx in range(input_ids.shape[0]):
110
+ greenlist_ids = self._get_greenlist_ids(input_ids[b_idx])
111
+ batched_greenlist_ids[b_idx] = greenlist_ids
112
+
113
+ green_tokens_mask = self._calc_greenlist_mask(scores=scores, greenlist_token_ids=batched_greenlist_ids)
114
+
115
+ scores = self._bias_greenlist_logits(scores=scores, greenlist_mask=green_tokens_mask, greenlist_bias=self.delta)
116
+ return scores
117
+
118
+
119
+ class WatermarkDetector(WatermarkBase):
120
+ def __init__(
121
+ self,
122
+ *args,
123
+ device: torch.device = None,
124
+ tokenizer: Tokenizer = None,
125
+ z_threshold: float = 4.0,
126
+ normalizers: list[str] = ["unicode"], # or also: ["unicode", "homoglyphs", "truecase"]
127
+ ignore_repeated_bigrams: bool = True,
128
+ **kwargs,
129
+ ):
130
+ super().__init__(*args, **kwargs)
131
+ # also configure the metrics returned/preprocessing options
132
+ assert device, "Must pass device"
133
+ assert tokenizer, "Need an instance of the generating tokenizer to perform detection"
134
+
135
+ self.tokenizer = tokenizer
136
+ self.device = device
137
+ self.z_threshold = z_threshold
138
+ self.rng = torch.Generator(device=self.device)
139
+
140
+ if self.seeding_scheme == "simple_1":
141
+ self.min_prefix_len = 1
142
+ else:
143
+ raise NotImplementedError(f"Unexpected seeding_scheme: {self.seeding_scheme}")
144
+
145
+ self.normalizers = []
146
+ for normalization_strategy in normalizers:
147
+ self.normalizers.append(normalization_strategy_lookup(normalization_strategy))
148
+
149
+ self.ignore_repeated_bigrams = ignore_repeated_bigrams
150
+ if self.ignore_repeated_bigrams:
151
+ assert self.seeding_scheme == "simple_1", "No repeated bigram credit variant assumes the single token seeding scheme."
152
+
153
+ def _compute_z_score(self, observed_count, T):
154
+ # count refers to number of green tokens, T is total number of tokens
155
+ expected_count = self.gamma
156
+ numer = observed_count - expected_count * T
157
+ denom = sqrt(T * expected_count * (1 - expected_count))
158
+ z = numer / denom
159
+ return z
160
+
161
+ def _compute_p_value(self, z):
162
+ p_value = scipy.stats.norm.sf(z)
163
+ return p_value
164
+
165
+ def _score_sequence(
166
+ self,
167
+ input_ids: Tensor,
168
+ return_num_tokens_scored: bool = True,
169
+ return_num_green_tokens: bool = True,
170
+ return_green_fraction: bool = True,
171
+ return_green_token_mask: bool = False,
172
+ return_z_score: bool = True,
173
+ return_p_value: bool = True,
174
+ ):
175
+ if self.ignore_repeated_bigrams:
176
+ # Method that only counts a green/red hit once per unique bigram.
177
+ # New num total tokens scored (T) becomes the number unique bigrams.
178
+ # We iterate over all unique token bigrams in the input, computing the greenlist
179
+ # induced by the first token in each, and then checking whether the second
180
+ # token falls in that greenlist.
181
+ assert return_green_token_mask is False, "Can't return the green/red mask when ignoring repeats."
182
+ bigram_table = {}
183
+ token_bigram_generator = ngrams(input_ids.cpu().tolist(), 2)
184
+ freq = collections.Counter(token_bigram_generator)
185
+ num_tokens_scored = len(freq.keys())
186
+ for idx, bigram in enumerate(freq.keys()):
187
+ prefix = torch.tensor([bigram[0]], device=self.device) # expects a 1-d prefix tensor on the randperm device
188
+ greenlist_ids = self._get_greenlist_ids(prefix)
189
+ bigram_table[bigram] = True if bigram[1] in greenlist_ids else False
190
+ green_token_count = sum(bigram_table.values())
191
+ else:
192
+ num_tokens_scored = len(input_ids) - self.min_prefix_len
193
+ if num_tokens_scored < 1:
194
+ raise ValueError(
195
+ (
196
+ f"Must have at least {1} token to score after "
197
+ f"the first min_prefix_len={self.min_prefix_len} tokens required by the seeding scheme."
198
+ )
199
+ )
200
+ # Standard method.
201
+ # Since we generally need at least 1 token (for the simplest scheme)
202
+ # we start the iteration over the token sequence with a minimum
203
+ # num tokens as the first prefix for the seeding scheme,
204
+ # and at each step, compute the greenlist induced by the
205
+ # current prefix and check if the current token falls in the greenlist.
206
+ green_token_count, green_token_mask = 0, []
207
+ for idx in range(self.min_prefix_len, len(input_ids)):
208
+ curr_token = input_ids[idx]
209
+ greenlist_ids = self._get_greenlist_ids(input_ids[:idx])
210
+ if curr_token in greenlist_ids:
211
+ green_token_count += 1
212
+ green_token_mask.append(True)
213
+ else:
214
+ green_token_mask.append(False)
215
+
216
+ score_dict = dict()
217
+ if return_num_tokens_scored:
218
+ score_dict.update(dict(num_tokens_scored=num_tokens_scored))
219
+ if return_num_green_tokens:
220
+ score_dict.update(dict(num_green_tokens=green_token_count))
221
+ if return_green_fraction:
222
+ score_dict.update(dict(green_fraction=(green_token_count / num_tokens_scored)))
223
+ if return_z_score:
224
+ score_dict.update(dict(z_score=self._compute_z_score(green_token_count, num_tokens_scored)))
225
+ if return_p_value:
226
+ z_score = score_dict.get("z_score")
227
+ if z_score is None:
228
+ z_score = self._compute_z_score(green_token_count, num_tokens_scored)
229
+ score_dict.update(dict(p_value=self._compute_p_value(z_score)))
230
+ if return_green_token_mask:
231
+ score_dict.update(dict(green_token_mask=green_token_mask))
232
+
233
+ return score_dict
234
+
235
+ def detect(
236
+ self,
237
+ text: str = None,
238
+ tokenized_text: list[int] = None,
239
+ return_prediction: bool = True,
240
+ return_scores: bool = True,
241
+ z_threshold: float = None,
242
+ **kwargs,
243
+ ) -> dict:
244
+
245
+ assert (text is not None) ^ (tokenized_text is not None), "Must pass either the raw or tokenized string"
246
+ if return_prediction:
247
+ kwargs["return_p_value"] = True # to return the "confidence":=1-p of positive detections
248
+
249
+ # run optional normalizers on text
250
+ for normalizer in self.normalizers:
251
+ text = normalizer(text)
252
+ if len(self.normalizers) > 0:
253
+ print(f"Text after normalization:\n\n{text}\n")
254
+
255
+ if tokenized_text is None:
256
+ assert self.tokenizer is not None, (
257
+ "Watermark detection on raw string ",
258
+ "requires an instance of the tokenizer ",
259
+ "that was used at generation time.",
260
+ )
261
+ tokenized_text = self.tokenizer(text, return_tensors="pt", add_special_tokens=False)["input_ids"][0].to(self.device)
262
+ if tokenized_text[0] == self.tokenizer.bos_token_id:
263
+ tokenized_text = tokenized_text[1:]
264
+ else:
265
+ # try to remove the bos_tok at beginning if it's there
266
+ if (self.tokenizer is not None) and (tokenized_text[0] == self.tokenizer.bos_token_id):
267
+ tokenized_text = tokenized_text[1:]
268
+
269
+ # call score method
270
+ output_dict = {}
271
+ score_dict = self._score_sequence(tokenized_text, **kwargs)
272
+ if return_scores:
273
+ output_dict.update(score_dict)
274
+ # if passed return_prediction then perform the hypothesis test and return the outcome
275
+ if return_prediction:
276
+ z_threshold = z_threshold if z_threshold else self.z_threshold
277
+ assert z_threshold is not None, "Need a threshold in order to decide outcome of detection test"
278
+ output_dict["prediction"] = score_dict["z_score"] > z_threshold
279
+ if output_dict["prediction"]:
280
+ output_dict["confidence"] = 1 - score_dict["p_value"]
281
+
282
+ return output_dict
lm-watermarking-main/watermark_reliability_release/PIPELINE.md ADDED
@@ -0,0 +1,154 @@
1
+ # Usage document for pipeline
2
+
3
+ 6/7/23: Will be updated and built out as required.
4
+
5
+ ## (1) **generate** a bunch of samples
6
+
7
+ The point of all this code is to construct pairwise examples
8
+ of human text, unwatermarked, and watermarked text in something
9
+ resembling an unbiased or IID manner, despite the difficulty of this ask.
10
+
11
+ The key functionality is _oversampling_. A series of arguments control how
12
+ the raw dataset samples are turned into prompts, and then, provided
13
+ the raw prompts pass some checks, the prompts are
14
+ fed to the model and outputs are generated naturally under both normal
15
+ decoding and watermark decoding. If the generations match the given
16
+ (length) output filtering criteria, then the row "counts" as one of the `N`
17
+ requested samples.
18
+
19
+ Otherwise, the generations are stored, but the global counter of progress
20
+ towards `N` is not incremented, and thus this "overhead" is the cost
21
+ of being very restrictive in desiring a "square" (`N` x `T`) shaped table of samples
22
+ in which all three of the human text, unwatermarked, and watermarked output columns
23
+ always have the same tokenized length.
24
+
25
+ At evaluation time, by default, all the point estimates, means, and ROC and AUC calculations are performed
26
+ on the subset of rows that all have about the target length (i.e. a subset with shape ~ `N` x `T`).
27
+
28
+ The `generation_pipeline.py` call in `run_pipeline.sh` demonstrates the basic usage.
29
+
30
+ ### Key arguments controlling the oversampling logic...
31
+
32
+ ### 'Shape' Controls
33
+
34
+ - `max_new_tokens`: an upper bound, i.e. the target length `T=200`
35
+ - `min_prompt_tokens` : prompt length lower bound, such as 50
36
+ - `min_generations` : the number of 'good' samples we'd like, i.e. `N=500`
37
+
38
+ ### Prompt construction strategy
39
+
40
+ - `input_truncation_strategy`
41
+
42
+ One in `["completion_length", "prompt_length"]`. If the former, slices the last
43
+ `max_new_tokens` tokens off of the raw sample, making the leading prefix (which can have variable length) the 'prompt' and the removed `max_new_tokens` tokens the `baseline_completion`, or gold output (see the sketch below).
44
+ If the latter, selects the leading `min_prompt_tokens` off of the raw sample as the prompt,
45
+ leaving the remaining tokens (variable length) the `baseline_completion`.
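+
+ As a minimal sketch (not the pipeline's exact code), the two strategies amount to simple slicing of the tokenized raw sample; the argument names below mirror the flags described above:
+
+ ```python
+ def split_prompt_and_completion(token_ids, strategy, max_new_tokens=200, min_prompt_tokens=50):
+     # illustrative only
+     if strategy == "completion_length":
+         # variable-length leading prefix becomes the prompt,
+         # the final max_new_tokens become the gold baseline_completion
+         return token_ids[:-max_new_tokens], token_ids[-max_new_tokens:]
+     elif strategy == "prompt_length":
+         # fixed-length prompt, variable-length gold baseline_completion
+         return token_ids[:min_prompt_tokens], token_ids[min_prompt_tokens:]
+     raise ValueError(f"unknown strategy: {strategy}")
+ ```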
46
+
47
+ ### Filtering/oversampling criteria
48
+
49
+ - `input_filtering_strategy`: Can be one of `["completion_length", "prompt_length", "prompt_and_completion_length"]`.
50
+ In each case, if the relevant field doesn't meet the minimum criteria given by
51
+ `max_new_tokens` or `min_prompt_tokens` respectively, then the raw sample is thrown
52
+ away before ever even being fed to the model.
53
+
54
+ - `output_filtering_strategy`: Can be one of `["no_filter", "max_new_tokens"]`. If the former, then no output filtering
55
+ is performed after generations are sampled from the model. However, if `max_new_tokens`,
56
+ then both the unwatermarked and watermarked generations are checked to ensure that
57
+ they are at least `max_new_tokens` long.
58
+
59
+ This is a subtle way of trying to adaptively collect samples (online, from any dataset) such that eventually we end up with at least a subset that matches the squareness (`N` x `T`) criteria we desire, without _forcing_ this to happen on every sample
60
+ by turning off the EOS token, which amounts to a potentially
61
+ pathological distribution shift in the unwatermarked and watermarked output distributions
62
+ which could confound the generality of results.
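+
+ The oversampling loop can be pictured roughly as follows (a hedged sketch; `generate_pair` stands in for the actual model call and the real pipeline tracks more bookkeeping):
+
+ ```python
+ def collect_square_samples(prompts, generate_pair, min_generations=500, max_new_tokens=200):
+     kept, overhead = [], []
+     for prompt in prompts:
+         no_wm_tokens, w_wm_tokens = generate_pair(prompt)  # normal and watermarked decoding
+         # output_filtering_strategy == "max_new_tokens": both outputs must reach the target length
+         if len(no_wm_tokens) >= max_new_tokens and len(w_wm_tokens) >= max_new_tokens:
+             kept.append((prompt, no_wm_tokens, w_wm_tokens))
+         else:
+             overhead.append((prompt, no_wm_tokens, w_wm_tokens))  # stored, but does not count towards N
+         if len(kept) >= min_generations:
+             break
+     return kept, overhead
+ ```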
63
+
64
+ Other generation args are explained by their argparse definitions, but these in particular control the watermarking:
65
+ - `seeding_scheme`: the watermarking embedding scheme being used, such as `lefthash` (formerly `simple_1`) or `selfhash` (formerly `algorithm-3` in reference to previous paper)
66
+ - `gamma`: parameter controlling size of the green partition for watermarking
67
+ - `delta`: parameter controlling how much bias is added to the green token logits before sampling
68
+
69
+ ---
70
+
71
+ ## (2) Optionally, apply an **attack** transformation to weaken the watermark, or make detection harder (for non-watermarking methods as well).
72
+
73
+ We implement three types of attacks in this pipeline: `gpt`, `dipper`, and `copy-paste`.
74
+ The key parameters for each are as follows:
75
+
76
+ - `gpt`:
77
+ - `attack_model_name`: the OpenAI model variant to use
78
+ - `attack_prompt_id` : the index of the prompt to use, see `utils/prompts.json`
79
+ - `no_wm_attack`: whether to attack the un-watermarked generation column (`no_wm_output`).
80
+ Default is the watermarked generation (`w_wm_output`)
81
+
82
+ - `dipper`:
83
+ - `lex`: lexical diversity knob for the dipper model/method
84
+ - `order`: order diversity knob for the paraphrase attack
85
+
86
+ - `copy-paste`:
87
+ - `cp_attack_type`: k-t means `k` insertions of length `t`
88
+ - `cp_attack_num_insertions`: `k` spec'd as an integer
89
+ - `cp_attack_insertion_len`: `t` but generally spec'd as a percent of the full starting sequence length (i.e `25%`)
90
+ - `cp_attack_src_col` : the sequence we're taking the tokens "to be detected" from, i.e. "positive" examples for
91
+ the detector of interest. For watermarking this is `w_wm_output`
92
+ - `cp_attack_dst_col` : the sequence we treat as "negative" surrounding context for the detector of interest. For watermarking this is `no_wm_output`. A minimal sketch of the insertion logic is given after this list.
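+
+ The following is one possible reading of the `k`-`t` insertion scheme on token lists; the actual implementation lives in `utils/attack.py` and may differ in detail:
+
+ ```python
+ def copy_paste_sketch(src_tokens, dst_tokens, num_insertions=3, insertion_len=25):
+     # overwrite k evenly spaced spans of the "negative" dst context with spans copied
+     # from the watermarked src sequence (assumes src is at least as long as dst and
+     # insertion_len <= len(dst_tokens) // num_insertions)
+     attacked = list(dst_tokens)
+     segment = len(dst_tokens) // num_insertions
+     for i in range(num_insertions):
+         start = i * segment
+         attacked[start : start + insertion_len] = src_tokens[start : start + insertion_len]
+     return attacked
+ ```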
93
+
94
+ All parameters have an associated help string in their argparse definition.
95
+
96
+ The `attack_pipeline.py` call in `run_pipeline.sh` demonstrates the basic usage of the attack functionality.
97
+
98
+ ---
99
+
100
+ ## (3) Run **evaluation** and watermark detection
101
+
102
+ This batches the process of applying a combination of metric
103
+ functions to the dataset of generations (jsonl) and returns a
104
+ new dataset of generations (jsonl) just with extra columns for a bunch of metrics.
105
+
106
+ This is separated from the generation phase to allow a given set of
107
+ expensive generations to be reanalyzed in different ways with different metric
108
+ flavors as necessary.
109
+
110
+ The key parameters controlling metrics:
111
+
112
+
113
+ Key parameters and usage notes for detection:
114
+ - `evaluation_metrics`: a comma sep list of metrics to evaluate, such as `p-sp,repetition,diversity,z-score,windowed-z-score`
115
+ - `window_settings`: if running windowed detection, specifies the comma-separated windowing strategies (such as `20,40,max`)
116
+ - `retrieval_technique`: if running retrieval detection, whether to use the `sim` or `bm25` strategy
117
+
118
+ All (other) parameters have a help string in their argparse definition.
119
+
120
+ The `evaluation_pipeline.py` call in `run_pipeline.sh` demonstrates the basic usage.
121
+
122
+ ### Argument union and precedence
123
+
124
+ First, all arguments used at generation time (metadata file) are loaded by the
125
+ evaluation pipeline. Then the commandline args that were passed to the eval pipeline
126
+ are added via an update, or "overwriting union" operator, where all new args for
127
+ evaluation only are added to the current metadata object, but those that were
128
+ also present at generation time are _**overwritten**_ by those included in the
129
+ evaluation argparse.
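+
+ Concretely, the union is an ordinary dictionary update; `attack_pipeline.py` and the evaluation script do the same with the loaded metadata and `args.__dict__` (the keys and values below are made up for illustration):
+
+ ```python
+ generation_meta = {"model_name": "some-model", "gamma": 0.25, "max_new_tokens": 200}
+ eval_args = {"evaluation_metrics": "z-score", "gamma": 0.5}  # gamma clashes with generation time
+
+ joined_args = generation_meta.copy()
+ joined_args.update(eval_args)  # evaluation-time value wins for shared keys
+ # joined_args["gamma"] == 0.5; other generation-time keys are preserved
+ ```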
130
+
131
+ If they match, then this is standard behavior. Overwriting shared arguments
132
+ is disabled by default, but can be allowed by passing the `overwrite_args` flag.
133
+
134
+ Additionally, the code writes the metrics file into the same directory as the
135
+ generations file if only `input_dir` is passed. However, for safety, clarity, and organization,
136
+ one can pass an output dir in which to write the new dataset with metrics, as well
137
+ as the evaluation metadata as demonstrated in the `run_pipeline.sh` example.
138
+
139
+ ---
140
+
141
+ ## (3.1) Retrieval and DetectGPT detection
142
+
143
+ ### Creating **prefixes**:
144
+
145
+ **Retrieval** detection is implemented as a metric, i.e. it is run by the evaluation script. To perform retrieval detection on full examples, nothing extra is required. To run retrieval at T, you first must run `broadcast_token_prefixes.py` with the `save_per_prefix` argument as `False` and with a `prefix_stride` of choice, such as 50, with a clean generation or attacked generation directory (with `jsonl` and meta file inside) as input. This will create a version of the dataset (new `jsonl` file) that contains all of the original rows, duplicated and then sliced to each prefix length defined by iterating by `prefix_stride` in the sequence length dimension.
146
+
147
+ For example, if you have a file with `N=500` rows of length about `T=200` each, then running this script with `prefix_stride=50` would create a new file with `N=2000` rows, where the first `500` rows all have length `50`, the next `500` have length `100`, etc. If a given row, say of length `119`, is too short for prefix length `i`, say the 3rd slice size in this example, `150`, then in the third block it would be marked as `None`. This prevents a prefix block that is expected to consist entirely of a certain prefix length from containing sequences that are shorter than expected, which would confound the measurement. A hedged sketch of this broadcasting step follows below.
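+
+ A hedged sketch of the broadcasting step (the column names are illustrative; the real script operates on the `jsonl` rows directly):
+
+ ```python
+ def broadcast_prefixes(rows, prefix_stride=50, max_len=200, col="w_wm_output_tokens"):
+     broadcast = []
+     for length in range(prefix_stride, max_len + 1, prefix_stride):  # 50, 100, 150, 200
+         for row in rows:
+             tokens = row[col]
+             sliced = tokens[:length] if len(tokens) >= length else None  # too-short rows marked None
+             broadcast.append({**row, "prefix_length": length, col: sliced})
+     return broadcast
+ ```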
148
+
149
+ Now for **DetectGPT**, a separate script, `detectgpt/detectgpt_main.py`, must be run pointing at a clean generation or attacked generation `jsonl` file. Additionally, to run detectgpt @ T, similar prefixing logic must be used. However, it must be run with `save_per_prefix` as `True` this time, which then creates a set of new files, each containing all the rows of the input `jsonl` file but truncated to each prefix length as described above. Then each run of the detectgpt script produces a new `jsonl` file (of length `N=500` in the above example) with the detectgpt score column added. Then, the notebook `join_jsonl_prefix_files.ipynb` can be used to join all those separate jsonl files for each individual prefix into one full file (`N=2000`).
150
+
151
+ ### Running **detection**
152
+ For Retrieval detection, all that is necessary is to run the evaluation script on the `jsonl` containing all the prefixes, and point estimates for the detection at each prefix length will be created by grouping by the prefix length column and reducing. Note, the retrieval method will load only the full sequences into the retrieval database (by loading only the longest sample for each original row, so just `500` sequences in our example), but will query, or perform detection using all of the different prefixes.
153
+
154
+ For DetectGPT, the evaluation script must also be run, but with the `evaluation_metrics=detectgpt` alone, and no other metrics. This is because most of the script is a no-op at this point as every row already contains a detectgpt score and they just need to be turned into ROC plots or AUC measurements. As with retrieval detection, these will be automatically grouped by prefix length and reduced.
lm-watermarking-main/watermark_reliability_release/README.md ADDED
@@ -0,0 +1,27 @@
1
+ # 💧2.0: [On the Reliability of Watermarks for Large Language Models](https://arxiv.org/abs/2306.04634)
2
+
3
+ This directory contains the codebase for reproducing the experiments in our [new 6/7/23 preprint](https://arxiv.org/abs/2306.04634).
4
+
5
+ ### **NOTE**: this is a preliminary release, so please expect some small changes in the future as required.
6
+
7
+ ---
8
+
9
+ The watermarking and watermark detection code itself is an extension of the `WatermarkLogitsProcessor` and `WatermarkDetector` classes released as part of the original work and contained in the root of the repository. Additional logic implementing a wider array of seeding schemes and alternate detection strategies is included and depended upon by the extended versions of the classes in this directory.
10
+
11
+ To facilitate the broader array of experiments required for this study, an extra pipeline abstraction was implemented to manage the "generation", paraphrase "attack", and "evaluation" or detection phases. The general setup is that data, i.e. sets of generated samples, is written and read by each stage as "json lines" files `*.jsonl` with associated metadata files `*.json` to keep track of parameter settings used at each stage.
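+
+ A minimal sketch of that file convention using only the standard library (the release itself provides helpers such as `read_jsonlines` in `utils/io`, and the directory name below is illustrative):
+
+ ```python
+ import json
+
+ def read_jsonlines(path):
+     with open(path) as f:
+         return [json.loads(line) for line in f if line.strip()]
+
+ generations = read_jsonlines("run_output/gen_table.jsonl")  # one generation record per line
+ with open("run_output/gen_table_meta.json") as f:
+     meta = json.load(f)  # parameter settings used to produce gen_table.jsonl
+ ```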
12
+
13
+ A prose version of usage instructions for the pipeline is described in a separate markdown file here: [PIPELINE.md](PIPELINE.md)
14
+
15
+ ## wandb
16
+
17
+ The pipeline scripts, and in particular, the evaluation stage where detection is run and generation quality metrics are computed, are configured to push results to weights and biases (wandb). The figures in the paper are produced by:
18
+ 1. sketching out the charts in wandb using filters and tags
19
+ 2. exporting/downloading the csv's of the data for each chart, and
20
+ 3. loading them in a notebook to format plots as necessary.
21
+
22
+ Alternatively, the evaluation stage also saves a jsonl file where every line is a set of generations and all associated metrics and detection scores computed for it. This can also be loaded and analyzed manually in pandas, though the ROC-space analyses and average@T series for some metrics will have to be recomputed.
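+
+ For example, the per-row output can be pulled into a dataframe directly (the filename below is illustrative):
+
+ ```python
+ import pandas as pd
+
+ # each line is one generation with its metrics and detection scores attached
+ df = pd.read_json("eval_output/gen_table_w_metrics.jsonl", lines=True)
+ print(df.filter(like="z_score").describe())
+ ```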
23
+
24
+ ## llama
25
+
26
+ In order to use the llama model, you need to bring your own weights and then convert them to the huggingface format.
27
+
lm-watermarking-main/watermark_reliability_release/alternative_prf_schemes.py ADDED
@@ -0,0 +1,172 @@
1
+ """Implement other PRF functions, i.e. hashing schemes.
2
+
3
+ Can be hooked into the existing WatermarkLogitsProcessor as a modified base class WatermarkBase
4
+ """
5
+
6
+ import torch
7
+ from itertools import combinations
8
+ from functools import cache
9
+
10
+ # Key properties of a hashing scheme
11
+ props = {
12
+ "prf_type": str, # string name of the underlying PRF mapping multiple token ids to a random seed
13
+ "context_width": int, # this is h in the paper, how many previous tokens should be considered for each PRF
14
+ "self_salt": bool, # Use the rules laid out in robust-watermarking to use the token itself to seed and possibly reject its own list
15
+ "hash_key": int, # integer, large prime, used to move the seed away from low-entropy bit sequences in the PRF chosen above
16
+ }
17
+
18
+
19
+ def seeding_scheme_lookup(seeding_scheme: str):
20
+ if not isinstance(seeding_scheme, str):
21
+ raise ValueError("Seeding scheme should be a string summarizing the procedure.")
22
+ if seeding_scheme == "simple_1" or seeding_scheme == "lefthash":
23
+ # Default, simple bigram hash # alias for ff-additive_prf-1-False-15485863
24
+ prf_type = "additive_prf"
25
+ context_width = 1
26
+ self_salt = False
27
+ hash_key = 15485863
28
+ elif seeding_scheme == "algorithm-3" or seeding_scheme == "selfhash":
29
+ prf_type = "anchored_minhash_prf"
30
+ context_width = 4
31
+ self_salt = True
32
+ hash_key = 15485863
33
+ elif seeding_scheme == "skipgram":
34
+ prf_type = "skipgram_prf"
35
+ context_width = 5
36
+ self_salt = False
37
+ hash_key = 15485863
38
+ elif seeding_scheme.startswith(
39
+ "ff"
40
+ ): # freeform seeding scheme API - only use for experimenting
41
+ # expects strings of the form ff-additive_prf-4-True-hash or ff-additive_prf-5-True (hash key is optional)
42
+ split_scheme = seeding_scheme.split("-")
43
+ prf_type = str(split_scheme[1])
44
+ context_width = int(split_scheme[2])
45
+ self_salt = split_scheme[3] == "True"
46
+ if len(split_scheme) == 5:
47
+ hash_key = int(split_scheme[4])
48
+ else:
49
+ hash_key = 15485863
50
+ else:
51
+ raise ValueError(f"Invalid seeding scheme name {seeding_scheme} given. Try 'simple_1'?")
52
+
53
+ assert prf_type in prf_lookup.keys()
54
+ return prf_type, context_width, self_salt, hash_key
55
+
56
+
57
+ def multiplicative_prf(input_ids: torch.LongTensor, salt_key: int) -> int:
58
+ return salt_key * input_ids.prod().item()
59
+
60
+
61
+ def additive_prf(input_ids: torch.LongTensor, salt_key: int) -> int:
62
+ return salt_key * input_ids.sum().item()
63
+
64
+
65
+ def minfunc_prf(input_ids: torch.LongTensor, salt_key: int) -> int:
66
+ # not a great idea for non-random input ids as in text
67
+ return salt_key * input_ids.min().item()
68
+
69
+
70
+ def simple_skip_prf(input_ids: torch.LongTensor, salt_key: int, k=2) -> int:
71
+ # k is the skip distance
72
+ return hashint(salt_key * input_ids[::k]).prod().item()
73
+
74
+
75
+ def skipgram_prf(input_ids: torch.LongTensor, salt_key: int) -> int:
76
+ # maximum distance skipgram within context
77
+ return hashint(salt_key * input_ids[0]).item()
78
+
79
+
80
+ def anchored_skipgram_prf(input_ids: torch.LongTensor, salt_key: int, anchor: int = -1) -> int:
81
+ # maximum distance skipgram within context
82
+ return (hashint(salt_key * input_ids[0]) * hashint(salt_key * input_ids[anchor])).item()
83
+
84
+
85
+ def minhash_prf(input_ids: torch.LongTensor, salt_key: int) -> int:
86
+ # slightly less not the greatest idea for non-random input ids as in text
87
+ return hashint(salt_key * input_ids).min().item()
88
+
89
+
90
+ def anchored_minhash_prf(input_ids: torch.LongTensor, salt_key: int, anchor: int = -1) -> int:
91
+ # Anchor to one key to produce a min over pairs again
92
+ return (salt_key * hashint(input_ids) * hashint(input_ids[anchor])).min().item()
93
+
94
+
95
+ def minskipgram_prf(input_ids: torch.LongTensor, salt_key: int, k: int = 2) -> int:
96
+ # min over all skipgrams in context, k=2 is all pairs
97
+ skipgrams = torch.as_tensor(list(combinations(hashint(salt_key * input_ids), 2)))
98
+ return skipgrams.prod(dim=1).min().item()
99
+
100
+
101
+ def noncomm_prf(input_ids: torch.LongTensor, salt_key: int, k: int = 2) -> int:
102
+ key = torch.as_tensor(salt_key, dtype=torch.long)
103
+ for entry in input_ids:
104
+ key *= hashint(key * entry)
105
+ key %= 2**32
106
+ return key.item()
107
+
108
+
109
+ def position_prf(input_ids: torch.LongTensor, salt_key: int, k: int = 2) -> int:
110
+ return (
111
+ (salt_key * input_ids * torch.arange(1, len(input_ids) + 1, device=input_ids.device))
112
+ .sum()
113
+ .item()
114
+ )
115
+
116
+
117
+ prf_lookup = {
118
+ "multiplicative_prf": multiplicative_prf,
119
+ "additive_prf": additive_prf,
120
+ "minfunc_prf": minfunc_prf,
121
+ "simple_skip_prf": simple_skip_prf,
122
+ "skipgram_prf": skipgram_prf,
123
+ "anchored_skipgram_prf": anchored_skipgram_prf,
124
+ "minhash_prf": minhash_prf,
125
+ "anchored_minhash_prf": anchored_minhash_prf,
126
+ "minskipgram_prf": minskipgram_prf,
127
+ "noncomm_prf": noncomm_prf,
128
+ "position_prf": position_prf,
129
+ }
130
+
131
+ # Generate a global permute table once at startup
132
+ rng = torch.Generator(device=torch.device("cpu"))
133
+ rng.manual_seed(2971215073) # fib47 is prime
134
+ table_size = 1_000_003
135
+ fixed_table = torch.randperm(
136
+ 1_000_003, device=torch.device("cpu"), generator=rng
137
+ ) # actually faster than I thought
138
+
139
+
140
+ def hashint(integer_tensor: torch.LongTensor) -> torch.LongTensor:
141
+ """Sane version, in the end we only need a small permutation table."""
142
+ return (
143
+ fixed_table[integer_tensor.cpu() % table_size] + 1
144
+ ) # minor cheat here, this function always return CPU values
145
+
146
+
147
+ def _hashint_avalanche_tensor(integer_tensor: torch.LongTensor):
148
+ """http://burtleburtle.net/bob/hash/integer.html, ported into pytorch, runs on tensors. Apparently a decent avalanche."""
149
+ i = integer_tensor.to(torch.int32).clone() # or torch.int16?
150
+ i -= i << 6
151
+ i ^= i >> 17
152
+ i -= i << 9
153
+ i ^= i << 4
154
+ i -= i << 3
155
+ i ^= i << 10
156
+ i ^= i >> 15
157
+ return i.to(torch.long)
158
+
159
+
160
+ @cache
161
+ def _hashint_avalanche_int(integer: int):
162
+ """http://burtleburtle.net/bob/hash/integer.html, runs in base python, caches based on access.
163
+ Does this make sense for signed 64bit ints?"""
164
+ i = integer % (2**32)
165
+ i -= i << 6
166
+ i ^= i >> 17
167
+ i -= i << 9
168
+ i ^= i << 4
169
+ i -= i << 3
170
+ i ^= i << 10
171
+ i ^= i >> 15
172
+ return i
lm-watermarking-main/watermark_reliability_release/attack_pipeline.py ADDED
@@ -0,0 +1,506 @@
1
+ # coding=utf-8
2
+ # Copyright 2023 Authors of "A Watermark for Large Language Models"
3
+ # available at https://arxiv.org/abs/2301.10226
4
+ #
5
+ # Licensed under the Apache License, Version 2.0 (the "License");
6
+ # you may not use this file except in compliance with the License.
7
+ # You may obtain a copy of the License at
8
+ #
9
+ # http://www.apache.org/licenses/LICENSE-2.0
10
+ #
11
+ # Unless required by applicable law or agreed to in writing, software
12
+ # distributed under the License is distributed on an "AS IS" BASIS,
13
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14
+ # See the License for the specific language governing permissions and
15
+ # limitations under the License.
16
+
17
+ import os
18
+ import argparse
19
+ from functools import partial
20
+
21
+ from tqdm import tqdm
22
+ import wandb
23
+
24
+ from datasets import Dataset
25
+ from utils.submitit import str2bool # better bool flag type for argparse
26
+ from utils.io import read_jsonlines, read_json, write_json, write_jsonlines
27
+
28
+ from utils.evaluation import NO_CHECK_ARGS, load_tokenizer
29
+
30
+ from utils.attack import (
31
+ SUPPORTED_ATTACK_METHODS,
32
+ gpt_attack,
33
+ dipper_attack,
34
+ tokenize_for_copy_paste,
35
+ copy_paste_attack,
36
+ scramble_attack,
37
+ )
38
+
39
+ print(f"Current huggingface cache dir: {os.environ['HF_HOME']}")
40
+
41
+
42
+ def main(args):
43
+ ###########################################################################
44
+ # Create output dir if it doesn't exist, and warn if it contains an
45
+ # attacked generations file
46
+ ###########################################################################
47
+ gen_table_attacked_path = f"{args.output_dir}/gen_table_attacked.jsonl"
48
+ attacked_meta_path = f"{args.output_dir}/gen_table_attacked_meta.json"
49
+
50
+ print(f"Output dir for this run: {args.output_dir}")
51
+ # notify if exists
52
+ if os.path.exists(args.output_dir):
53
+ print(f"Output dir for this run already exists!")
54
+ print(f"Contents: {sorted(os.listdir(args.output_dir))}")
55
+ # warn if metrics file exists
56
+ if os.path.exists(gen_table_attacked_path):
57
+ if not args.overwrite_output_file:
58
+ print(
59
+ f"WARNING: Exiting to avoid overwriting output file. "
60
+ f"Pass the '--overwrite_output_file' flag to ignore this check."
61
+ )
62
+ exit()
63
+ else:
64
+ print(
65
+ f"WARNING: Found existing generation files with metrics added at this output dir. "
66
+ f"Overwriting anyway :/"
67
+ )
68
+ else:
69
+ # create the output dir where run artifacts are stored
70
+ os.makedirs(args.output_dir)
71
+
72
+ ###########################################################################
73
+ # Parse attack_method arg
74
+ ###########################################################################
75
+ # check that attack method is supported
76
+ assert (
77
+ args.attack_method in SUPPORTED_ATTACK_METHODS
78
+ ), f"Unsupported attack '{args.attack_method}'"
79
+ print(f"Attack method: {args.attack_method}")
80
+
81
+ ###########################################################################
82
+ # Load generations
83
+ ###########################################################################
84
+ print(f"Input dir for this run: {args.input_dir}")
85
+ print(f"Loading previously generated outputs for attacking ...")
86
+ gen_table_meta_path = f"{args.input_dir}/gen_table_meta.json"
87
+ gen_table_path = f"{args.input_dir}/gen_table.jsonl"
88
+ safe_gen_table_path = f"{args.input_dir}/gen_table_safe.jsonl"
89
+
90
+ assert os.path.exists(
91
+ gen_table_meta_path
92
+ ), f"failed file check for prev generations metadata json file: {gen_table_meta_path}"
93
+ assert os.path.exists(
94
+ gen_table_path
95
+ ), f"failed file check for prev generations jsonl file: {gen_table_path}"
96
+ assert not os.path.exists(safe_gen_table_path), (
97
+ f"failed for safety bc there is a secondary 'safe' marked file",
98
+ f" in this dir indicating a possible issue with the generation step. ",
99
+ )
100
+
101
+ cmdline_args = args.__dict__.copy()
102
+ prev_gen_table_meta = read_json(gen_table_meta_path)
103
+
104
+ joined_args = prev_gen_table_meta.copy()
105
+ joined_args.update(cmdline_args)
106
+
107
+ # check that the args used to generate the prev generations are the same as
108
+ # the current args, for the intersection of keys
109
+ if not args.overwrite_args:
110
+ for key in prev_gen_table_meta.keys():
111
+ if key in NO_CHECK_ARGS:
112
+ continue
113
+ assert joined_args[key] == prev_gen_table_meta[key], (
114
+ f"failed for safety bc after merging the prev metadata with "
115
+ f"the current cmdline args, values for '{key}' are not the same. "
116
+ f"in metadata: {prev_gen_table_meta[key]}, passed: {cmdline_args[key]}. "
117
+ f"Pass the '--overwrite_args' flag to ignore this check."
118
+ )
119
+
120
+ args = argparse.Namespace(**joined_args)
121
+ gen_table = [ex for ex in read_jsonlines(gen_table_path)]
122
+ gen_table_ds = Dataset.from_list(gen_table[: args.limit_rows])
123
+
124
+ ###########################################################################
125
+ # Start logging, we wait to do this until after loading the generations
126
+ # so that we can log the args used to generate them unioned with the
127
+ # cmdline args
128
+ ###########################################################################
129
+ # storing slurm info to allow auditing logfiles
130
+ # note this is set after the metadata check to ignore overwriting
131
+ args.SLURM_JOB_ID = os.getenv("SLURM_JOB_ID")
132
+ args.SLURM_ARRAY_JOB_ID = os.getenv("SLURM_ARRAY_JOB_ID")
133
+ args.SLURM_ARRAY_TASK_ID = os.getenv("SLURM_ARRAY_TASK_ID")
134
+
135
+ if args.wandb:
136
+ # start a new wandb run to track this experiment, will send data to it
137
+ run = wandb.init(
138
+ # set the wandb project where this run will be logged
139
+ project=args.wandb_project,
140
+ entity=args.wandb_entity,
141
+ name=f"{args.run_name}",
142
+ # track hyperparameters and run metadata
143
+ config=args,
144
+ tags=args.wandb_tags,
145
+ )
146
+
147
+ ###########################################################################
148
+ # GPT attack
149
+ ###########################################################################
150
+
151
+ if args.attack_method == "gpt":
152
+ print("Running GPT attack")
153
+ import openai
154
+
155
+ openai.api_key = os.environ["OPENAI_API_KEY"]
156
+ prompt_pool = read_json("utils/prompts.json")["prompt_pool"]
157
+ prompt_pool = {int(k): v for k, v in prompt_pool.items()}
158
+
159
+ if args.attack_prompt is None:
160
+ attack_prompt = prompt_pool[args.attack_prompt_id]
161
+ args.attack_prompt = attack_prompt
162
+
163
+ print(f"Using attack prompt: {attack_prompt}")
164
+
165
+ gpt_attack_partial = partial(
166
+ gpt_attack,
167
+ attack_prompt=attack_prompt,
168
+ args=args,
169
+ )
170
+ # gen_table_attacked_ds = gen_table_ds.map(
171
+ # gpt_attack_partial, batched=False, num_proc=min(len(gen_table_ds), 16)
172
+ # )
173
+ gen_table_attacked_ds = gen_table_ds.map(gpt_attack_partial, batched=False)
174
+
175
+ ###########################################################################
176
+ # DIPPER attack
177
+ ###########################################################################
178
+
179
+ elif args.attack_method == "dipper":
180
+ print("Running DIPPER attack")
181
+ print(f"Using lexical diversity: {args.lex}, order diversity: {args.order}")
182
+ gen_table_attacked_ds = dipper_attack(
183
+ gen_table_ds, lex=args.lex, order=args.order, args=args
184
+ )
185
+
186
+ ###########################################################################
187
+ # Scramble attack
188
+ ###########################################################################
189
+ elif args.attack_method == "scramble":
190
+ # if no cp_attack_min_len specified, use args.max_new_tokens
191
+ if args.cp_attack_min_len == 0:
192
+ args.cp_attack_min_len = args.max_new_tokens
193
+ tokenizer = load_tokenizer(args)
194
+ scramble_attack_partial = partial(
195
+ scramble_attack,
196
+ tokenizer=tokenizer,
197
+ args=args,
198
+ )
199
+ gen_table_attacked_ds = gen_table_ds.map(scramble_attack_partial, batched=False)
200
+ ###########################################################################
201
+ # Copy-paste attack
202
+ ###########################################################################
203
+ elif args.attack_method == "copy-paste":
204
+ # if no cp_attack_min_len specified, use args.max_new_tokens
205
+ if args.cp_attack_min_len == 0:
206
+ args.cp_attack_min_len = args.max_new_tokens
207
+
208
+ # NOTE FIXME: the above arg indicates the filter condition by which
209
+ # some rows are skipped/not attacked/NOOP. Since the attacked col
210
+ # is set to the empty string, and length 0, the detection code
211
+ # including the baselines 🤞🏼 will ignore these rows one way or another
212
+
213
+ # convert cp_attack_insertion_len to int
214
+ if "%" in args.cp_attack_insertion_len:
215
+ original_len_str = args.cp_attack_insertion_len
216
+ # treat as a percent of 1 minus the length of the source col
217
+ # effectively how much of the source col "remains", accounting for
218
+ # the number of insertions that will be made to total this length
219
+ args.cp_attack_insertion_len = (
220
+ int((int(args.cp_attack_insertion_len[:-1]) / 100) * args.max_new_tokens)
221
+ // args.cp_attack_num_insertions
222
+ )
223
+ # check that this is not more than args.max_new_tokens total
224
+ assert (
225
+ args.cp_attack_insertion_len * args.cp_attack_num_insertions <= args.max_new_tokens
226
+ ) and (
227
+ args.cp_attack_insertion_len * args.cp_attack_num_insertions > 0
228
+ ), f"Invalid attack strength: {original_len_str} for {args.cp_attack_num_insertions} insertions."
229
+
230
+ args.cp_attack_effective_attack_percentage = (
231
+ 1 - (int(original_len_str[:-1]) / 100)
232
+ ) * 100
233
+ print(
234
+ f"Effective attack percentage is 1-{original_len_str}={args.cp_attack_effective_attack_percentage}% by "
235
+ f"copying {args.cp_attack_num_insertions} x {args.cp_attack_insertion_len} = {args.cp_attack_num_insertions * args.cp_attack_insertion_len} tokens "
236
+ f"from {args.cp_attack_src_col} to {args.cp_attack_dst_col} where T={args.max_new_tokens}"
237
+ )
238
+ else:
239
+ args.cp_attack_insertion_len = int(args.cp_attack_insertion_len)
240
+ args.cp_attack_effective_attack_percentage = (
241
+ 1
242
+ - (
243
+ (args.cp_attack_insertion_len * args.cp_attack_num_insertions)
244
+ / args.max_new_tokens
245
+ )
246
+ ) * 100
247
+ print(
248
+ f"Effective attack percentage is {args.cp_attack_effective_attack_percentage}% by "
249
+ f"copying {args.cp_attack_num_insertions} x {args.cp_attack_insertion_len} = {args.cp_attack_num_insertions * args.cp_attack_insertion_len} tokens "
250
+ f"from {args.cp_attack_src_col} to {args.cp_attack_dst_col} where T={args.max_new_tokens}"
251
+ )
252
+
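To make the percentage arithmetic above concrete, here is a small sketch with made-up values (25% insertion length, 3 insertions, T=200; none of these numbers come from an actual run):

    # hypothetical values, mirroring the "%" branch above
    max_new_tokens = 200
    cp_attack_num_insertions = 3
    cp_attack_insertion_len = "25%"
    insertion_len = int((int(cp_attack_insertion_len[:-1]) / 100) * max_new_tokens) // cp_attack_num_insertions
    # insertion_len == 50 // 3 == 16 tokens per insertion
    copied_total = insertion_len * cp_attack_num_insertions                     # 48 tokens copied from src
    effective_attack_pct = (1 - int(cp_attack_insertion_len[:-1]) / 100) * 100  # 75.0

Because of the integer division, the copied total (48) can fall slightly below the nominal 25% of T (50); the effective attack percentage printed in the log is simply 100 minus the nominal copied percentage.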
253
+ tokenizer = load_tokenizer(args)
254
+ tokenize_for_copy_paste_partial = partial(tokenize_for_copy_paste, tokenizer=tokenizer)
255
+ gen_table_tokd_ds = gen_table_ds.map(tokenize_for_copy_paste_partial, batched=False)
256
+
257
+ copy_paste_attack_partial = partial(copy_paste_attack, tokenizer=tokenizer, args=args)
258
+ gen_table_attacked_ds = gen_table_tokd_ds.map(copy_paste_attack_partial, batched=False)
259
+ else:
260
+ raise ValueError(f"Invalid attack method: {args.attack_method}")
261
+
262
+ ###########################################################################
263
+ # Write the final attacked dataset out to disk in jsonl format
264
+ ###########################################################################
265
+
266
+ # write the metadata file, which is a union of the previous metadata
267
+ # and the current cmdline args
268
+ write_json(args.__dict__, attacked_meta_path, indent=4)
269
+
270
+ gen_table_attacked_lst = [ex for ex in gen_table_attacked_ds]
271
+ write_jsonlines(gen_table_attacked_lst, gen_table_attacked_path)
272
+
273
+ ###########################################################################
274
+ # Log the data/series to wandb
275
+ ###########################################################################
276
+ # log the metrics to wandb
277
+ if args.wandb:
278
+ # find cols that should be logged in a table
279
+ tabular_column_types = ["string", "bool"]
280
+ tabular_column_names = [
281
+ name
282
+ for name, _ in filter(
283
+ lambda tup: tup[1].dtype in tabular_column_types,
284
+ gen_table_attacked_ds.features.items(),
285
+ )
286
+ ]
287
+ # the rest should be logged as series
288
+ series_column_names = [
289
+ name
290
+ for name, _ in filter(
291
+ lambda tup: tup[1].dtype not in tabular_column_types,
292
+ gen_table_attacked_ds.features.items(),
293
+ )
294
+ ]
295
+ for metric_name in series_column_names:
296
+ # summarize series metrics as mean by default
297
+ wandb.define_metric(metric_name, summary="mean")
298
+ # log the raw series
299
+ for example in tqdm(
300
+ gen_table_attacked_ds.remove_columns(tabular_column_names),
301
+ desc="Logging series metrics to wandb",
302
+ ):
303
+ run.log(example)
304
+ # log the raw tabular data
305
+ # but also include the dataset index as a column
306
+ series_column_names.remove("idx")
307
+ table = wandb.Table(
308
+ dataframe=gen_table_attacked_ds.remove_columns(series_column_names).to_pandas()
309
+ )
310
+ run.log({"output_table": table})
311
+
312
+ # finish the wandb run
313
+ run.finish()
314
+
315
+ return
316
+
317
+
318
+ if __name__ == "__main__":
319
+ parser = argparse.ArgumentParser(description="Run evaluation pipeline for watermark detection")
320
+ parser.add_argument(
321
+ "--attack_method",
322
+ type=str,
323
+ choices=SUPPORTED_ATTACK_METHODS,
324
+ default="gpt",
325
+ help="The attack method to use.",
326
+ )
327
+ parser.add_argument(
328
+ "--attack_model_name",
329
+ type=str,
330
+ default="gpt-3.5-turbo",
331
+ )
332
+ parser.add_argument(
333
+ "--attack_temperature",
334
+ type=float,
335
+ default=0.7,
336
+ )
337
+ parser.add_argument(
338
+ "--attack_max_tokens",
339
+ type=int,
340
+ default=1000,
341
+ )
342
+ parser.add_argument(
343
+ "--attack_prompt_id",
344
+ type=int,
345
+ default=4,
346
+ )
347
+ parser.add_argument(
348
+ "--attack_prompt",
349
+ type=str,
350
+ default=None,
351
+ help="Pass in the prompt to use for the attack. Is loaded by id from utils/prompts.json by default.",
352
+ )
353
+ parser.add_argument(
354
+ "--no_wm_attack",
355
+ type=str2bool,
356
+ default=False,
357
+ help="Whether to attack the no_wm_output column when running gpt or dipper.",
358
+ )
359
+ parser.add_argument(
360
+ "--overwrite_args",
361
+ type=str2bool,
362
+ default=False,
363
+ help="Whether to overwrite the shared args in the metadata file with the current, runtime args.",
364
+ )
365
+ parser.add_argument(
366
+ "--wandb",
367
+ type=str2bool,
368
+ default=False,
369
+ help="Whether to log to wandb.",
370
+ )
371
+ parser.add_argument(
372
+ "--wandb_project",
373
+ type=str,
374
+ default="lm-watermarking",
375
+ help="The name of the wandb project.",
376
+ )
377
+ parser.add_argument(
378
+ "--wandb_entity",
379
+ type=str,
380
+ default="jwkirchenbauer",
381
+ help="The wandb entity/user for the project.",
382
+ )
383
+ parser.add_argument(
384
+ "--wandb_tags",
385
+ type=str,
386
+ default="",
387
+ help="The comma separated list of tags to add to the wandb run.",
388
+ )
389
+ parser.add_argument(
390
+ "--run_name",
391
+ type=str,
392
+ default=None,
393
+ help="The unique name for the run.",
394
+ )
395
+ parser.add_argument(
396
+ "--input_dir",
397
+ type=str,
398
+ default="./input",
399
+ help="The directory containing the input files.",
400
+ )
401
+ parser.add_argument(
402
+ "--output_dir",
403
+ type=str,
404
+ default=None,
405
+ help=(
406
+ "The directory in which to write out the dataset after adding the metrics. "
407
+ "If not specified, will use the input_dir. Note, if the output_dir already "
408
+ "contains the metric-enriched file, it will be overwritten :/"
409
+ ),
410
+ )
411
+ parser.add_argument(
412
+ "--overwrite_output_file",
413
+ type=str2bool,
414
+ default=False,
415
+ help="Whether to overwrite the output file if it already exists.",
416
+ )
417
+ parser.add_argument(
418
+ "--limit_rows",
419
+ type=int,
420
+ default=None,
421
+ help="The number of rows to limit the dataset to. Useful for debugging.",
422
+ )
423
+ parser.add_argument(
424
+ "--verbose",
425
+ type=str2bool,
426
+ default=False,
427
+ help="Whether to print verbose output of every attack.",
428
+ )
429
+ parser.add_argument(
430
+ "--lex",
431
+ type=int,
432
+ default=20,
433
+ help="Lexical diversity knob for the paraphrase attack.",
434
+ )
435
+ parser.add_argument(
436
+ "--order",
437
+ type=int,
438
+ default=0,
439
+ help="Order diversity knob for the paraphrase attack.",
440
+ )
441
+ parser.add_argument(
442
+ "--cp_attack_type",
443
+ type=str,
444
+ default="single-single",
445
+ choices=["single-single", "triple-single", "k-t"],
446
+ help="Type of copy-paste attack to be run.",
447
+ )
448
+ parser.add_argument(
449
+ "--cp_attack_min_len",
450
+ type=int,
451
+ default=0,
452
+ help="Minimum length of cols for the copy-paste attack to be run.",
453
+ )
454
+ parser.add_argument(
455
+ "--cp_attack_num_insertions",
456
+ type=int,
457
+ default=3,
458
+ help="Number of insertions for the copy-paste attack.",
459
+ )
460
+ parser.add_argument(
461
+ "--cp_attack_insertion_len",
462
+ type=str,
463
+ default="20",
464
+ help=(
465
+ f"Length of each insertion for the copy-paste attack; converted to int. "
466
+ f"If expressed as a percentage, it instead refers to the percent of src "
467
+ f"that is copied to dst in total, "
468
+ f"i.e. 1 minus the attack strength as a percentage."
469
+ ),
470
+ )
471
+ parser.add_argument(
472
+ "--cp_attack_src_col",
473
+ type=str,
474
+ default="w_wm_output",
475
+ help="Source column for the copy-paste attack.",
476
+ )
477
+ parser.add_argument(
478
+ "--cp_attack_dst_col",
479
+ type=str,
480
+ default="no_wm_output",
481
+ help="Destination column for the copy-paste attack.",
482
+ )
483
+ args = parser.parse_args()
484
+
485
+ ###########################################################################
486
+ # Argument validation and conditional setting
487
+ ###########################################################################
488
+
489
+ assert args.attack_method, "attack_method must be specified"
490
+
491
+ # if no output dir specified, use the input dir
492
+ if args.output_dir is None:
493
+ args.output_dir = args.input_dir
494
+
495
+ # check limit_rows
496
+ assert (args.limit_rows is None) or (
497
+ (args.limit_rows > 0) and isinstance(args.limit_rows, int)
498
+ ), "limit_rows must be > 0 or None"
499
+
500
+ # split wandb tags
501
+ if args.wandb_tags != "":
502
+ args.wandb_tags = args.wandb_tags.split(",")
503
+ else:
504
+ args.wandb_tags = []
505
+
506
+ main(args)
lm-watermarking-main/watermark_reliability_release/broadcast_token_prefixes.py ADDED
@@ -0,0 +1,436 @@
1
+ # coding=utf-8
2
+ # Copyright 2023 Authors of "A Watermark for Large Language Models"
3
+ # available at https://arxiv.org/abs/2301.10226
4
+ #
5
+ # Licensed under the Apache License, Version 2.0 (the "License");
6
+ # you may not use this file except in compliance with the License.
7
+ # You may obtain a copy of the License at
8
+ #
9
+ # http://www.apache.org/licenses/LICENSE-2.0
10
+ #
11
+ # Unless required by applicable law or agreed to in writing, software
12
+ # distributed under the License is distributed on an "AS IS" BASIS,
13
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14
+ # See the License for the specific language governing permissions and
15
+ # limitations under the License.
16
+
17
+
18
+ note = "Note: this script should be moved to, and run from, the directory that contains the `utils` subdir in order to work properly"
19
+ print(note)
20
+
21
+ import os
22
+ import argparse
23
+ from functools import partial
24
+ from tqdm import tqdm
25
+
26
+ import wandb
27
+ import torch
28
+ import numpy as np
29
+
30
+ from datasets import Dataset, concatenate_datasets
31
+
32
+ from utils.submitit import str2bool # better bool flag type for argparse
33
+ from utils.io import read_jsonlines, read_json, write_json, write_jsonlines
34
+ from utils.evaluation import load_tokenizer, NO_CHECK_ARGS
35
+ from utils.generation import tokenize_only
36
+
37
+ print(f"Current huggingface cache dir: {os.environ['HF_HOME']}")
38
+
39
+
40
+ def main(args):
41
+ ###########################################################################
42
+ # Load generations
43
+ ###########################################################################
44
+ print(f"Input dir for this run: {args.input_dir}")
45
+
46
+ print(f"Loading previously generated outputs for evaluation via oracle model and metrics...")
47
+
48
+ # check for the "attacked version" of the gen table first
49
+ gen_table_meta_path = f"{args.input_dir}/gen_table_attacked_meta.json"
50
+ gen_table_path = f"{args.input_dir}/gen_table_attacked.jsonl"
51
+ safe_gen_table_path = f"{args.input_dir}/gen_table_attacked_safe.jsonl"
52
+
53
+ attack_variants_exist = [
54
+ os.path.exists(gen_table_meta_path),
55
+ os.path.exists(gen_table_path),
56
+ ]
57
+ found_attacked_files = all(attack_variants_exist)
58
+ if not found_attacked_files:
59
+ gen_table_meta_path = f"{args.input_dir}/gen_table_meta.json"
60
+ gen_table_path = f"{args.input_dir}/gen_table.jsonl"
61
+ safe_gen_table_path = f"{args.input_dir}/gen_table_safe.jsonl"
62
+
63
+ assert os.path.exists(
64
+ gen_table_meta_path
65
+ ), f"failed file check for prev generations metadata json file: {gen_table_meta_path}"
66
+ assert os.path.exists(
67
+ gen_table_path
68
+ ), f"failed file check for prev generations jsonl file: {gen_table_path}"
69
+
70
+ assert not os.path.exists(safe_gen_table_path), (
71
+ f"failed for safety bc there is a secondary 'safe' marked file"
72
+ f" in this dir indicating a possible issue with the generation step. "
73
+ )
74
+
75
+ cmdline_args = args.__dict__.copy()
76
+ prev_gen_table_meta = read_json(gen_table_meta_path)
77
+
78
+ joined_args = prev_gen_table_meta.copy()
79
+ for k, v in cmdline_args.items():
80
+ if v is not None or (not k in joined_args):
81
+ joined_args.update({k: v})
82
+ else:
83
+ print(
84
+ f"cmdline arg {k} is None, leaving it as the value found in the input metadata (or None): {prev_gen_table_meta.get(k)}"
85
+ )
86
+
87
+ # check that the args used to generate the prev generations are the same as
88
+ # the current args, for the intersection of keys
89
+
90
+ for key in prev_gen_table_meta.keys():
91
+ if key in NO_CHECK_ARGS:
92
+ continue
93
+ assert joined_args[key] == prev_gen_table_meta[key], (
94
+ f"failed for safety bc after merging the prev metadata with "
95
+ f"the current cmdline args, values for '{key}' are not the same. "
96
+ f"in metadata: {prev_gen_table_meta[key]}, passed: {cmdline_args[key]}. "
97
+ f"Pass the '--overwrite_args' flag to ignore this check."
98
+ )
99
+
100
+ args = argparse.Namespace(**joined_args)
101
+ gen_table = [ex for ex in read_jsonlines(gen_table_path)]
102
+ gen_table_ds = Dataset.from_list(gen_table[: args.limit_rows])
103
+
104
+ ######## length filtering: only keeps the samples of exact N tokens ########
105
+
106
+ df = gen_table_ds.to_pandas()
107
+ original_len = len(df)
108
+ print(f"Original #samples: {original_len}")
109
+ if args.filter_length:
110
+ df = df[
111
+ (df["baseline_completion_length"] == args.max_new_tokens)
112
+ & (df["no_wm_output_length"] == args.max_new_tokens)
113
+ & (df["w_wm_output_length"] == args.max_new_tokens)
114
+ ]
115
+ # TODO: filter length for the attacked output
116
+ print(f" after filtering token length: {len(df)}")
117
+ gen_table_ds = Dataset.from_pandas(df)
118
+
119
+ ###########################################################################
120
+ # Prefix list logic
121
+ ###########################################################################
122
+ from utils.generation import tokenize_and_truncate
123
+
124
+ print(f"Generating prefixes for the gen table...")
125
+
126
+ # load the tokenizer
127
+ tokenizer = load_tokenizer(args)
128
+
129
+ def generate_prefix(example, prefix_length=None, text_col_names=None, tokenizer=None):
130
+ assert prefix_length is not None, "prefix_length must be specified"
131
+ assert text_col_names is not None and isinstance(
132
+ text_col_names, list
133
+ ), "text_col_names must be a list of column names"
134
+
135
+ # make a copy of the example
136
+ example = example.copy()
137
+
138
+ tokd_column_data = {}
139
+ for text_col_name in text_col_names:
140
+ try:
141
+ # check that the col exists
142
+ assert text_col_name in example, f"text_col_name '{text_col_name}' not in example"
143
+ # check whether the prefix is OOB for this example
144
+ # NOTE, this logic might not make perfect sense, but it avoids having prefixes that are ragged
145
+ # which is a better quality when measuring @ idx_T
146
+
147
+ # tokenize first because we can't rely on the length col existing
148
+ example = tokenize_only(
149
+ example,
150
+ input_col_name=text_col_name,
151
+ hf_model_name=args.model_name_or_path,
152
+ tokenizer=tokenizer,
153
+ model_max_length=args.model_max_length,
154
+ )
155
+ raw_inputs = example.pop("input_ids")
156
+
157
+ if not (prefix_length <= raw_inputs.shape[1]):
158
+ if args.verbose:
159
+ print(
160
+ f"Skipping prefix generation for col {text_col_name} because prefix_length"
161
+ f" {prefix_length} is OOB for this example (orig length={raw_inputs.shape[1]})."
162
+ )
163
+ continue
164
+
165
+ # else slice the inputs to the prefix length
166
+ inputs = raw_inputs[:, : prefix_length + 1]
167
+ prefix_len = inputs.shape[1]
168
+
169
+ # decode the prefix
170
+ decoded_prefix = tokenizer.decode(inputs[0], skip_special_tokens=True)
171
+ # store the prefix and it's length
172
+ tokd_column_data.update(
173
+ {
174
+ f"{text_col_name}": decoded_prefix,
175
+ f"{text_col_name}_length": prefix_len,
176
+ }
177
+ )
178
+ except Exception as e:
179
+ if args.verbose:
180
+ print(
181
+ f"Failed to generate prefix of len {prefix_length} for example idx={example['idx']}\n"
182
+ f"Should either be because the col doesn't exist, or the prefix is OOB for this col in this example."
183
+ )
184
+ print(f"Exception: {e}")
185
+ if text_col_name not in tokd_column_data:
186
+ tokd_column_data.update({f"{text_col_name}": None, f"{text_col_name}_length": None})
187
+
188
+ # add the prefix_len to the example
189
+ # then add the prefixes to the example
190
+ example.update({"prefix_length": prefix_length})
191
+ example.update(tokd_column_data)
192
+ return example
193
+
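For intuition about what the broadcasting below produces, a sketch with illustrative numbers only: each input row is re-emitted once per prefix length, with every text column truncated to roughly that many tokens, so the output row count is (number of input rows) x (number of prefix lengths).

    # illustrative shape only, not actual data
    # prefix_lengths = [50, 100, 150] and 500 input rows -> 1500 output rows;
    # a row tagged prefix_length == 100 carries the first ~100 tokens of each text
    # column, and columns shorter than the requested prefix are stored as None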
194
+ # if max_prefix_length is not specified, use the max length for the gen table
195
+ if args.max_prefix_length is None:
196
+ # args.max_prefix_length = args.model_max_length
197
+ args.max_prefix_length = args.max_new_tokens
198
+
199
+ # get the maximum length out of the ["baseline_completion_length", "no_wm_output_length", "w_wm_output_length", "w_wm_output_attacked_length"]
200
+ # found in the gen table
201
+ max_gen_table_output_length = max(
202
+ [
203
+ ex["baseline_completion_length"]
204
+ for ex in gen_table_ds
205
+ if "baseline_completion_length" in ex
206
+ ]
207
+ + [ex["no_wm_output_length"] for ex in gen_table_ds if "no_wm_output_length" in ex]
208
+ + [ex["w_wm_output_length"] for ex in gen_table_ds if "w_wm_output_length" in ex]
209
+ + [
210
+ ex["w_wm_output_attacked_length"]
211
+ for ex in gen_table_ds
212
+ if "w_wm_output_attacked_length" in ex
213
+ ]
214
+ )
215
+
216
+ args.max_prefix_length = min(args.max_prefix_length, max_gen_table_output_length)
217
+
218
+ # round down to the nearest multiple of prefix_stride
219
+ last_multiple = args.max_prefix_length - (args.max_prefix_length % args.prefix_stride)
220
+ prefix_lengths = list(
221
+ range(args.prefix_stride, last_multiple + args.prefix_stride, args.prefix_stride)
222
+ )
223
+ # if missing the largest prefix length, add it
224
+ if prefix_lengths[-1] != args.max_prefix_length:
225
+ prefix_lengths.append(args.max_prefix_length)
226
+
227
+ if args.max_prefix_length > prefix_lengths[-1]:
228
+ print(
229
+ f"WARNING: max_prefix_length {args.max_prefix_length} is larger than the last prefix length {prefix_lengths[-1]} "
230
+ f"as computed by prefix_stride {args.prefix_stride} multiples up to the longest prefix length in the gen table: "
231
+ f"{max_gen_table_output_length}."
232
+ )
233
+
234
+ # store the prefix lengths
235
+ args.prefix_lengths = prefix_lengths
236
+ print(prefix_lengths)
237
+
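A quick sketch of the schedule this produces, with hypothetical values (prefix_stride=50, max_prefix_length=195):

    # hypothetical: prefix_stride = 50, max_prefix_length = 195
    last_multiple = 195 - (195 % 50)                # 150
    prefix_lengths = list(range(50, 150 + 50, 50))  # [50, 100, 150]
    if prefix_lengths[-1] != 195:
        prefix_lengths.append(195)                  # [50, 100, 150, 195]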
238
+ ###########################################################################
239
+ # Create output dir if it doesn't exist, and warn if it contains metric file
240
+ # we do this here because we need the prefix list
241
+ ###########################################################################
242
+ # gen_table_prefixes_path = f"{args.output_dir}/gen_table_prefixes.jsonl"
243
+ # gen_table_prefixes_meta_path = f"{args.output_dir}/gen_table_prefixes_meta.json"
244
+ # making these the same as normal data so they can be used in the same way by eval
245
+ gen_table_prefixes_path = f"{args.output_dir}/gen_table.jsonl"
246
+ gen_table_prefixes_meta_path = f"{args.output_dir}/gen_table_meta.json"
247
+
248
+ if found_attacked_files:
249
+ gen_table_prefixes_path = f"{args.output_dir}/gen_table_attacked.jsonl"
250
+ gen_table_prefixes_meta_path = f"{args.output_dir}/gen_table_attacked_meta.json"
251
+
252
+ print(f"Output dir for this run: {args.output_dir}")
253
+ # notify if exists
254
+ if os.path.exists(args.output_dir):
255
+ print(f"Output dir for this run already exists!")
256
+ print(f"Contents: {sorted(os.listdir(args.output_dir))}")
257
+ # warn if metrics file exists
258
+ if args.save_per_prefix:
259
+ for prefix_len in prefix_lengths:
260
+ prefix_table_path = (
261
+ f"{gen_table_prefixes_path.replace('.jsonl','')}_{prefix_len}.jsonl"
262
+ )
263
+ if os.path.exists(prefix_table_path):
264
+ if not args.overwrite_output_file:
265
+ print(
266
+ f"WARNING: Exiting to avoid overwriting prefix output file. "
267
+ f"Pass the '--overwrite_output_file' flag to ignore this check."
268
+ )
269
+ exit()
270
+ else:
271
+ print(
272
+ f"WARNING: Found existing prefix files at this output dir. "
273
+ f"Overwriting anyway :/"
274
+ )
275
+
276
+ elif os.path.exists(gen_table_prefixes_path):
277
+ if not args.overwrite_output_file:
278
+ print(
279
+ f"WARNING: Exiting to avoid overwriting prefix output file. "
280
+ f"Pass the '--overwrite_output_file' flag to ignore this check."
281
+ )
282
+ exit()
283
+ else:
284
+ print(
285
+ f"WARNING: Found existing prefix files at this output dir. "
286
+ f"Overwriting anyway :/"
287
+ )
288
+ else:
289
+ # create the output dir where run artifacts are stored
290
+ os.makedirs(args.output_dir)
291
+
292
+ ###########################################################################
293
+ # Generate the prefixes
294
+ ###########################################################################
295
+
296
+ prefix_tables = []
297
+ gen_table_ds_lst = [ex for ex in gen_table_ds]
298
+
299
+ # hacky check to see whether were working with attacked files
300
+ text_col_names = ["baseline_completion", "no_wm_output", "w_wm_output"]
301
+ if "w_wm_output_attacked" in gen_table_ds_lst[0]:
302
+ assert found_attacked_files, (
303
+ f"found 'w_wm_output_attacked' in the gen table, but apparently we didn't 'load attacked files'? "
304
+ f"Odd... please check what's going on in the input_dir."
305
+ )
306
+
307
+ text_col_names.append("w_wm_output_attacked")
308
+
309
+ for prefix_len in tqdm(prefix_lengths):
310
+ prefixes_partial = partial(
311
+ generate_prefix,
312
+ prefix_length=prefix_len,
313
+ tokenizer=tokenizer,
314
+ text_col_names=text_col_names,
315
+ )
316
+ gen_table_prefixes = [prefixes_partial(ex) for ex in gen_table_ds_lst]
317
+
318
+ # add the prefix dataset to the list of prefix tables
319
+ prefix_tables.append(Dataset.from_list(gen_table_prefixes))
320
+
321
+ # now concat the tables
322
+ gen_table_prefixes = concatenate_datasets(prefix_tables)
323
+
324
+ ###########################################################################
325
+ # Write the metadata and final dataset out to disk in jsonl format
326
+ # (and optionally save the individual prefix shards)
327
+ ###########################################################################
328
+
329
+ # write the metadata
330
+ write_json(args.__dict__, gen_table_prefixes_meta_path, indent=4)
331
+
332
+ # write the dataset
333
+ if not args.save_per_prefix:
334
+ write_jsonlines(gen_table_prefixes, gen_table_prefixes_path)
335
+ else:
336
+ # save the individual prefix shards
337
+ for prefix_len in prefix_lengths:
338
+ prefix_table = gen_table_prefixes.filter(lambda ex: ex["prefix_length"] == prefix_len)
339
+ prefix_table_path = f"{gen_table_prefixes_path.replace('.jsonl','')}_{prefix_len}.jsonl"
340
+ write_jsonlines(prefix_table, prefix_table_path)
341
+
342
+
343
+ if __name__ == "__main__":
344
+ parser = argparse.ArgumentParser(
345
+ description="Transform jsonl datasets into a broadcasted prefix version."
346
+ )
347
+ parser.add_argument(
348
+ "--model_name_or_path",
349
+ type=str,
350
+ help="use to load the tokenizer",
351
+ )
352
+ parser.add_argument(
353
+ "--prefix_stride",
354
+ type=int,
355
+ default=10,
356
+ help="The stride to use when generating prefixes.",
357
+ )
358
+ parser.add_argument(
359
+ "--max_prefix_length",
360
+ type=int,
361
+ default=None,
362
+ help="The maximum prefix length to use when generating prefixes.",
363
+ )
364
+ parser.add_argument(
365
+ "--model_max_length",
366
+ type=int,
367
+ default=2048,
368
+ )
369
+ parser.add_argument(
370
+ "--input_dir",
371
+ type=str,
372
+ default=None,
373
+ help="The directory containing the input files.",
374
+ )
375
+ parser.add_argument(
376
+ "--output_dir",
377
+ type=str,
378
+ default=None,
379
+ help=("The directory in which to write out the dataset after creating prefixes. "),
380
+ )
381
+ parser.add_argument(
382
+ "--save_per_prefix",
383
+ type=str2bool,
384
+ default=False,
385
+ help="Whether to save the individual shards of the dataset corresponding to each prefix length.",
386
+ )
387
+ parser.add_argument(
388
+ "--overwrite_output_file",
389
+ type=str2bool,
390
+ default=False,
391
+ help="Whether to overwrite the output file if it already exists.",
392
+ )
393
+ parser.add_argument(
394
+ "--limit_rows",
395
+ type=int,
396
+ default=None,
397
+ help="The number of rows to limit the dataset to. Useful for debugging.",
398
+ )
399
+ parser.add_argument(
400
+ "--verbose",
401
+ type=str2bool,
402
+ default=False,
403
+ help="Whether to print out the indexes for errors as the prefixes are generated.",
404
+ )
405
+ parser.add_argument(
406
+ "--filter_length",
407
+ action="store_true",
408
+ default=False,
409
+ )
410
+ parser.add_argument(
411
+ "--max_new_tokens",
412
+ type=int,
413
+ default=None,
414
+ )
415
+ args = parser.parse_args()
416
+
417
+ ###########################################################################
418
+ # Argument validation and conditional setting
419
+ ###########################################################################
420
+
421
+ # require output_dir to be specified and different from input_dir
422
+ assert args.input_dir is not None
423
+ assert args.output_dir is not None
424
+ assert args.input_dir != args.output_dir, "input_dir and output_dir must be different"
425
+
426
+ # check limit_rows
427
+ assert (args.limit_rows is None) or (
428
+ (args.limit_rows > 0) and isinstance(args.limit_rows, int)
429
+ ), "limit_rows must be > 0 or None"
430
+
431
+ # check prefix_stride
432
+ assert (args.prefix_stride > 0) and isinstance(
433
+ args.prefix_stride, int
434
+ ), "prefix_stride must be > 0"
435
+
436
+ main(args)
lm-watermarking-main/watermark_reliability_release/detectgpt/debug.sh ADDED
@@ -0,0 +1,77 @@
1
+ #!/bin/bash
2
+
3
+ #SBATCH --partition tron
4
+
5
+ #SBATCH --gres=gpu:rtxa6000:1
6
+
7
+ #SBATCH --ntasks=4
8
+
9
+ #SBATCH --mem=32G
10
+
11
+ #SBATCH --account=nexus
12
+
13
+ #SBATCH --qos=default
14
+
15
+ #SBATCH --time=48:00:00
16
+
17
+ #SBATCH --array=0-1
18
+
19
+ #SBATCH --output=slurm_logs/%A_%a.out
20
+
21
+ #SBATCH --job-name=run-detect
22
+
23
+ source ~/.bashrc
24
+ conda activate watermarking-dev
25
+
26
+ OUTPUT_DIR=/cmlscratch/manlis/test/watermarking-root/input/new_runs
27
+
28
+ # model_name="facebook/opt-1.3b"
29
+ # data_path="/cmlscratch/manlis/test/watermarking-root/input/new_runs/test_len_200_opt1_3b_evaluation/gen_table_w_metrics.jsonl"
30
+
31
+ model_name='facebook/opt-6.7b'
32
+ data_path='/cmlscratch/manlis/test/watermarking-root/input/new_runs/test_len_1000_evaluation/gen_table_w_metrics.jsonl'
33
+
34
+ mask_model="t5-3b"
35
+
36
+ # token_len=200
37
+ chunk_size=32
38
+ pct=0.3
39
+ split="no_wm"
40
+
41
+ textlen=600
42
+
43
+ # python detectgpt_main.py \
44
+ # --n_perturbation_list="10,100" \
45
+ # --do_chunk \
46
+ # --base_model_name=${model_name} \
47
+ # --mask_filling_model_name=${mask_model} \
48
+ # --data_path=/cmlscratch/manlis/test/watermarking-root/input/new_runs/test_len_${textlen}_evaluation/gen_table_w_metrics.jsonl \
49
+ # --token_len=${textlen} \
50
+ # --pct_words_masked=${pct} \
51
+ # --chunk_size=${chunk_size} \
52
+ # --data_split=${split};
53
+
54
+ declare -a commands
55
+
56
+ for textlen in 600 1000;
57
+ do
58
+ commands+=( "python detectgpt_main.py \
59
+ --n_perturbation_list="10,100" \
60
+ --do_chunk \
61
+ --base_model_name=${model_name} \
62
+ --mask_filling_model_name=${mask_model} \
63
+ --data_path=/cmlscratch/manlis/test/watermarking-root/input/new_runs/test_len_${textlen}_evaluation/gen_table_w_metrics.jsonl \
64
+ --token_len=${textlen} \
65
+ --pct_words_masked=${pct} \
66
+ --chunk_size=${chunk_size} \
67
+ --data_split=${split};" )
68
+
69
+ done
70
+
71
+ bash -c "${commands[${SLURM_ARRAY_TASK_ID}]}"
72
+
73
+ # --data_path=/cmlscratch/manlis/test/watermarking-root/input/new_runs/test_len_${textlen}_evaluation/gen_table_w_metrics.jsonl \
74
+
75
+
76
+
77
+
lm-watermarking-main/watermark_reliability_release/detectgpt/detectgpt_main.py ADDED
@@ -0,0 +1,807 @@
1
+ # Basic imports
2
+ import os
3
+ import argparse
4
+ import re
5
+ import functools
6
+
7
+ from tqdm import tqdm
8
+ from statistics import mean
9
+
10
+ import numpy as np
11
+ import pandas as pd
12
+ import torch
13
+
14
+ import matplotlib.pyplot as plt
15
+ from matplotlib import rc
16
+
17
+ rc("font", **{"family": "serif", "serif": ["Computer Modern"]})
18
+ rc("text", usetex=True)
19
+ from sklearn.metrics import roc_curve, precision_recall_curve, auc
20
+
21
+ import cmasher as cmr
22
+
23
+ # ### Load the processed dataset/frame
24
+
25
+
26
+ import sys
27
+
28
+ sys.path.insert(0, "..")
29
+
30
+ from datasets import Dataset
31
+ from utils.io import read_jsonlines, load_jsonlines
32
+
33
+ import transformers
34
+
35
+ # some file i/o helpers
36
+ from utils.io import write_jsonlines, write_json
37
+
38
+ INPUT_DIR = "/cmlscratch/manlis/test/watermarking-root/input"
39
+ OUTPUT_DIR = "/cmlscratch/manlis/test/watermarking-root/output"
40
+
41
+
42
+ # 15 colorblind-friendly colors
43
+ COLORS = [
44
+ "#0072B2",
45
+ "#009E73",
46
+ "#D55E00",
47
+ "#CC79A7",
48
+ "#F0E442",
49
+ "#56B4E9",
50
+ "#E69F00",
51
+ "#000000",
52
+ "#0072B2",
53
+ "#009E73",
54
+ "#D55E00",
55
+ "#CC79A7",
56
+ "#F0E442",
57
+ "#56B4E9",
58
+ "#E69F00",
59
+ ]
60
+
61
+
62
+ def tokenize_and_mask(
63
+ text, span_length, pct, ceil_pct=False, buffer_size=1, mask_string="<<<mask>>>"
64
+ ):
65
+ if isinstance(text, str):
66
+ tokens = text.split(" ")
67
+ else:
68
+ tokens = text
69
+ mask_string = mask_string
70
+
71
+ n_spans = pct * len(tokens) / (span_length + buffer_size * 2)
72
+ if ceil_pct:
73
+ n_spans = np.ceil(n_spans)
74
+ n_spans = int(n_spans)
75
+
76
+ n_masks = 0
77
+ while n_masks < n_spans:
78
+ start = np.random.randint(0, len(tokens) - span_length)
79
+ end = start + span_length
80
+ search_start = max(0, start - buffer_size)
81
+ search_end = min(len(tokens), end + buffer_size)
82
+ if mask_string not in tokens[search_start:search_end]:
83
+ tokens[start:end] = [mask_string]
84
+ n_masks += 1
85
+
86
+ # replace each occurrence of mask_string with <extra_id_NUM>, where NUM increments
87
+ num_filled = 0
88
+ for idx, token in enumerate(tokens):
89
+ if token == mask_string:
90
+ tokens[idx] = f"<extra_id_{num_filled}>"
91
+ num_filled += 1
92
+ assert num_filled == n_masks, f"num_filled {num_filled} != n_masks {n_masks}"
93
+ text = " ".join(tokens)
94
+ return text
95
+
96
+
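A minimal illustration of what tokenize_and_mask returns (the sentence is made up and the masked span positions are drawn at random):

    text = "the quick brown fox jumps over the lazy dog near the old river bank"
    # 14 words, pct=0.3, span_length=2, buffer_size=1
    # -> n_spans = int(0.3 * 14 / (2 + 2)) = 1, so one two-word span is replaced
    masked = tokenize_and_mask(text, span_length=2, pct=0.3)
    # e.g. "the quick brown fox <extra_id_0> the lazy dog near the old river bank"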
97
+ def tokenize_and_mask_glm(
98
+ text, span_length, pct, ceil_pct=False, buffer_size=1, mask_string="[MASK]"
99
+ ):
100
+ tokens = text.split(" ")
101
+ mask_string = mask_string
102
+
103
+ n_spans = pct * len(tokens) / (span_length + buffer_size * 2)
104
+ if ceil_pct:
105
+ n_spans = np.ceil(n_spans)
106
+ n_spans = int(n_spans)
107
+
108
+ n_masks = 0
109
+ while n_masks < n_spans:
110
+ start = np.random.randint(0, len(tokens) - span_length)
111
+ end = start + span_length
112
+ search_start = max(0, start - buffer_size)
113
+ search_end = min(len(tokens), end + buffer_size)
114
+ if mask_string not in tokens[search_start:search_end]:
115
+ tokens[start:end] = [mask_string]
116
+ n_masks += 1
117
+
118
+ text = " ".join(tokens)
119
+ return text
120
+
121
+
122
+ def count_masks(texts):
123
+ return [len([x for x in text.split() if x.startswith("<extra_id_")]) for text in texts]
124
+
125
+
126
+ # replace each masked span with a sample from T5 mask_model
127
+ def replace_masks(texts):
128
+ n_expected = count_masks(texts)
129
+ stop_id = mask_tokenizer.encode(f"<extra_id_{max(n_expected)}>")[0]
130
+ tokens = mask_tokenizer(texts, return_tensors="pt", padding=True, truncation=True).to("cuda")
131
+ outputs = mask_model.generate(
132
+ **tokens,
133
+ max_length=mask_tokenizer.model_max_length,
134
+ do_sample=True,
135
+ top_p=1.0,
136
+ num_return_sequences=1,
137
+ eos_token_id=stop_id,
138
+ )
139
+ # outputs = mask_model.generate(**tokens, max_length=mask_tokenizer.model_max_length, do_sample=True, top_p=1.0, num_return_sequences=1, eos_token_id=stop_id)
140
+ return mask_tokenizer.batch_decode(outputs, skip_special_tokens=False)
141
+
142
+
143
+ def replace_masks_glm(texts):
144
+ # n_expected = [len([x for x in text.split() if x == '[MASK]']) for text in texts]
145
+ tokens = mask_tokenizer(texts, return_tensors="pt", padding=True, truncation=True)
146
+ tokens = mask_tokenizer.build_inputs_for_generation(
147
+ tokens, max_gen_length=mask_tokenizer.model_max_length
148
+ ).to("cuda")
149
+ outputs = mask_model.generate(
150
+ **tokens,
151
+ max_length=mask_tokenizer.model_max_length,
152
+ do_sample=True,
153
+ top_p=1.0,
154
+ num_return_sequences=1,
155
+ eos_token_id=mask_tokenizer.eop_token_id,
156
+ )
157
+ return mask_tokenizer.batch_decode(outputs, skip_special_tokens=False)
158
+
159
+
160
+ def extract_fills(texts):
161
+ # remove <pad> from beginning of each text
162
+ texts = [x.replace("<pad>", "").replace("</s>", "").strip() for x in texts]
163
+
164
+ # return the text in between each matched mask token
165
+ extracted_fills = [pattern.split(x)[1:-1] for x in texts]
166
+
167
+ # remove whitespace around each fill
168
+ extracted_fills = [[y.strip() for y in x] for x in extracted_fills]
169
+
170
+ return extracted_fills
171
+
172
+
173
+ def apply_extracted_fills(masked_texts, extracted_fills):
174
+ # split masked text into tokens, only splitting on spaces (not newlines)
175
+ tokens = [x.split(" ") for x in masked_texts]
176
+
177
+ n_expected = count_masks(masked_texts)
178
+
179
+ # replace each mask token with the corresponding fill
180
+ for idx, (text, fills, n) in enumerate(zip(tokens, extracted_fills, n_expected)):
181
+ if len(fills) < n:
182
+ tokens[idx] = []
183
+ else:
184
+ for fill_idx in range(n):
185
+ text[text.index(f"<extra_id_{fill_idx}>")] = fills[fill_idx]
186
+
187
+ # join tokens back into text
188
+ texts = [" ".join(x) for x in tokens]
189
+ return texts
190
+
191
+
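Taken together, the helpers above implement the DetectGPT perturbation round trip, roughly as follows (a sketch; batching, chunking, and the retry loop are handled by perturb_texts_ below):

    masked = [tokenize_and_mask(x, span_length=2, pct=0.3) for x in texts]  # insert <extra_id_N> sentinels
    raw_fills = replace_masks(masked)                 # T5 generates a fill for each sentinel
    fills = extract_fills(raw_fills)                  # pull out the text between sentinels
    perturbed = apply_extracted_fills(masked, fills)  # splice the fills back in place of the sentinels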
192
+ def perturb_texts_(
193
+ texts, span_length, pct, ceil_pct=False, mask_filling_model_name="t5-3b", do_chunk=False
194
+ ):
195
+ if "t5" in mask_filling_model_name:
196
+ if do_chunk:
197
+ texts = [x.split(" ") for x in texts]
198
+ ## chunk long texts
199
+ if max([len(x) for x in texts]) > 600:
200
+ text_pieces = [
201
+ [t[: len(t) // 3] for t in texts],
202
+ [t[len(t) // 3 : 2 * len(t) // 3] for t in texts],
203
+ [t[2 * len(t) // 3 :] for t in texts],
204
+ ]
205
+ else:
206
+ text_pieces = [[t[: len(t) // 2] for t in texts], [t[len(t) // 2 :] for t in texts]]
207
+
208
+ perturbed_pieces = []
209
+ for pieces in text_pieces:
210
+ masked_texts = [tokenize_and_mask(x, span_length, pct, ceil_pct) for x in pieces]
211
+ raw_fills = replace_masks(masked_texts)
212
+ extracted_fills = extract_fills(raw_fills)
213
+ perturbed_pieces.append(apply_extracted_fills(masked_texts, extracted_fills))
214
+ ## put the chunks together
215
+ perturbed_texts = []
216
+ for i in range(len(texts)):
217
+ perturbed_texts.append(" ".join([p[i] for p in perturbed_pieces]))
218
+ else:
219
+ masked_texts = [tokenize_and_mask(x, span_length, pct, ceil_pct) for x in texts]
220
+ raw_fills = replace_masks(masked_texts)
221
+ extracted_fills = extract_fills(raw_fills)
222
+ perturbed_texts = apply_extracted_fills(masked_texts, extracted_fills)
223
+ # elif 'glm' in mask_filling_model_name:
224
+ # masked_texts = [tokenize_and_mask_glm(x, span_length, pct, ceil_pct) for x in texts]
225
+ # raw_fills = replace_masks_glm(masked_texts)
226
+ # extracted_fills = extract_fills(raw_fills)
227
+ # perturbed_texts = apply_extracted_fills(masked_texts, extracted_fills)
228
+
229
+ # Handle the fact that sometimes the model doesn't generate the right number of fills and we have to try again
230
+ attempts = 1
231
+ while "" in perturbed_texts:
232
+ idxs = [idx for idx, x in enumerate(perturbed_texts) if x == ""]
233
+ print(f"WARNING: {len(idxs)} texts have no fills. Trying again [attempt {attempts}].")
234
+ if do_chunk:
235
+ new_perturbed_pieces = []
236
+ for pieces in text_pieces:
237
+ masked_texts = [
238
+ tokenize_and_mask(x, span_length, pct, ceil_pct)
239
+ for idx, x in enumerate(pieces)
240
+ if idx in idxs
241
+ ]
242
+ raw_fills = replace_masks(masked_texts)
243
+ extracted_fills = extract_fills(raw_fills)
244
+ new_perturbed_pieces.append(apply_extracted_fills(masked_texts, extracted_fills))
245
+ new_perturbed_texts = []
246
+ for i in range(len(texts)):
247
+ new_perturbed_texts.append(" ".join([p[i] for p in new_perturbed_pieces]))
248
+ else:
249
+ masked_texts = [
250
+ tokenize_and_mask(x, span_length, pct, ceil_pct)
251
+ for idx, x in enumerate(texts)
252
+ if idx in idxs
253
+ ]
254
+ raw_fills = replace_masks(masked_texts)
255
+ extracted_fills = extract_fills(raw_fills)
256
+ new_perturbed_texts = apply_extracted_fills(masked_texts, extracted_fills)
257
+ for idx, x in zip(idxs, new_perturbed_texts):
258
+ perturbed_texts[idx] = x
259
+ attempts += 1
260
+
261
+ return perturbed_texts
262
+
263
+
264
+ def perturb_texts(
265
+ texts, span_length, pct, mask_filling_model_name, ceil_pct=False, chunk_size=20, do_chunk=False
266
+ ):
267
+ chunk_size = chunk_size
268
+ if "11b" in mask_filling_model_name:
269
+ chunk_size //= 2
270
+
271
+ outputs = []
272
+ for i in tqdm(range(0, len(texts), chunk_size), desc="Applying perturbations"):
273
+ outputs.extend(
274
+ perturb_texts_(
275
+ texts[i : i + chunk_size],
276
+ span_length,
277
+ pct,
278
+ ceil_pct=ceil_pct,
279
+ mask_filling_model_name=mask_filling_model_name,
280
+ do_chunk=do_chunk,
281
+ )
282
+ )
283
+ return outputs
284
+
285
+
286
+ # Get the log likelihood of each text under the base_model
287
+ def get_ll(text):
288
+ with torch.no_grad():
289
+ tokenized = base_tokenizer(text, return_tensors="pt").to("cuda")
290
+ labels = tokenized.input_ids
291
+ return -base_model(**tokenized, labels=labels).loss.item()
292
+
293
+
294
+ def get_lls(texts):
295
+ return [get_ll(text) for text in texts]
296
+
297
+
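A note on get_ll above: because labels is set to the input ids, the Hugging Face causal-LM loss is the mean per-token cross-entropy, so the returned value is an average per-token log-likelihood rather than a summed log-probability. A sketch of the equivalent explicit computation, for illustration only (the pipeline uses get_ll as written):

    import torch.nn.functional as F

    def get_ll_manual(text):
        # same quantity as get_ll, computed directly from the logits
        with torch.no_grad():
            enc = base_tokenizer(text, return_tensors="pt").to("cuda")
            logits = base_model(**enc).logits[:, :-1]   # position t predicts token t+1
            targets = enc.input_ids[:, 1:]
            token_logp = F.log_softmax(logits, dim=-1).gather(-1, targets.unsqueeze(-1)).squeeze(-1)
            return token_logp.mean().item()             # equals the -loss value returned by get_ll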
298
+ def get_perturbation_results(
299
+ span_length=10,
300
+ chunk_size=50,
301
+ n_perturbations=1,
302
+ n_perturbation_rounds=1,
303
+ pct_words_masked=0.3,
304
+ data_split="wm",
305
+ mask_filling_model_name="t5-3b",
306
+ do_chunk=False,
307
+ save_path="/cmlscratch/manlis/test/watermarking-root/output/detect-gpt",
308
+ ):
309
+ ## check if pre-computed results exist
310
+ if os.path.isfile(os.path.join(save_path, f"perturbed_raw_texts_{n_perturbations}.jsonl")):
311
+ results = load_jsonlines(
312
+ os.path.join(save_path, f"perturbed_raw_texts_{n_perturbations}.jsonl")
313
+ )
314
+ else:
315
+ base_model.cpu()
316
+ mask_model.cuda()
317
+
318
+ torch.manual_seed(0)
319
+ np.random.seed(0)
320
+
321
+ results = []
322
+ original_text = df["baseline_completion"]
323
+ if data_split == "wm":
324
+ sampled_text = df["w_wm_output"]
325
+ elif data_split == "no_wm":
326
+ sampled_text = df["no_wm_output"]
327
+ elif data_split == "no_wm_paraphrase":
328
+ sampled_text = df["w_wm_output_attacked"]
329
+ else:
330
+ raise NotImplementedError(f"Unknown split: {data_split}")
331
+
332
+ perturb_fn = functools.partial(
333
+ perturb_texts,
334
+ span_length=span_length,
335
+ pct=pct_words_masked,
336
+ mask_filling_model_name=mask_filling_model_name,
337
+ chunk_size=chunk_size,
338
+ do_chunk=do_chunk,
339
+ )
340
+
341
+ p_sampled_text = perturb_fn([x for x in sampled_text for _ in range(n_perturbations)])
342
+ p_original_text = perturb_fn([x for x in original_text for _ in range(n_perturbations)])
343
+ for _ in range(n_perturbation_rounds - 1):
344
+ try:
345
+ p_sampled_text, p_original_text = perturb_fn(p_sampled_text), perturb_fn(
346
+ p_original_text
347
+ )
348
+ except AssertionError:
349
+ break
350
+
351
+ assert (
352
+ len(p_sampled_text) == len(sampled_text) * n_perturbations
353
+ ), f"Expected {len(sampled_text) * n_perturbations} perturbed samples, got {len(p_sampled_text)}"
354
+ assert (
355
+ len(p_original_text) == len(original_text) * n_perturbations
356
+ ), f"Expected {len(original_text) * n_perturbations} perturbed samples, got {len(p_original_text)}"
357
+
358
+ for i, idx in enumerate(original_text.index):
359
+ results.append(
360
+ {
361
+ "original": original_text[idx],
362
+ "sampled": sampled_text[idx],
363
+ "perturbed_sampled": p_sampled_text[
364
+ i * n_perturbations : (i + 1) * n_perturbations
365
+ ],
366
+ "perturbed_original": p_original_text[
367
+ i * n_perturbations : (i + 1) * n_perturbations
368
+ ],
369
+ }
370
+ )
371
+
372
+ ## save perturbed samples in case job got preempted
373
+ write_jsonlines(
374
+ results, os.path.join(save_path, f"perturbed_raw_texts_{n_perturbations}.jsonl")
375
+ )
376
+
377
+ mask_model.cpu()
378
+ base_model.cuda()
379
+
380
+ for res in tqdm(results, desc="Computing log likelihoods"):
381
+ p_sampled_ll = get_lls(res["perturbed_sampled"])
382
+ p_original_ll = get_lls(res["perturbed_original"])
383
+ res["original_ll"] = get_ll(res["original"])
384
+ res["sampled_ll"] = get_ll(res["sampled"])
385
+ res["all_perturbed_sampled_ll"] = p_sampled_ll
386
+ res["all_perturbed_original_ll"] = p_original_ll
387
+ res["perturbed_sampled_ll"] = np.mean(p_sampled_ll)
388
+ res["perturbed_original_ll"] = np.mean(p_original_ll)
389
+ res["perturbed_sampled_ll_std"] = np.std(p_sampled_ll) if len(p_sampled_ll) > 1 else 1
390
+ res["perturbed_original_ll_std"] = np.std(p_original_ll) if len(p_original_ll) > 1 else 1
391
+
392
+ return results
393
+
394
+
395
+ def get_roc_metrics(real_preds, sample_preds):
396
+ fpr, tpr, _ = roc_curve(
397
+ [0] * len(real_preds) + [1] * len(sample_preds), real_preds + sample_preds
398
+ )
399
+ roc_auc = auc(fpr, tpr)
400
+ return fpr.tolist(), tpr.tolist(), float(roc_auc)
401
+
402
+
403
+ def get_precision_recall_metrics(real_preds, sample_preds):
404
+ precision, recall, _ = precision_recall_curve(
405
+ [0] * len(real_preds) + [1] * len(sample_preds), real_preds + sample_preds
406
+ )
407
+ pr_auc = auc(recall, precision)
408
+ return precision.tolist(), recall.tolist(), float(pr_auc)
409
+
410
+
411
+ def run_perturbation_experiment(
412
+ results, criterion, span_length=10, n_perturbations=1, pct_words_masked=0.3, n_samples=500
413
+ ):
414
+ # compute diffs with perturbed
415
+ predictions = {"real": [], "samples": []}
416
+ for res in results:
417
+ if criterion == "d":
418
+ predictions["real"].append(res["original_ll"] - res["perturbed_original_ll"])
419
+ predictions["samples"].append(res["sampled_ll"] - res["perturbed_sampled_ll"])
420
+ elif criterion == "z":
421
+ if res["perturbed_original_ll_std"] == 0:
422
+ res["perturbed_original_ll_std"] = 1
423
+ print("WARNING: std of perturbed original is 0, setting to 1")
424
+ print(
425
+ f"Number of unique perturbed original texts: {len(set(res['perturbed_original']))}"
426
+ )
427
+ print(f"Original text: {res['original']}")
428
+ if res["perturbed_sampled_ll_std"] == 0:
429
+ res["perturbed_sampled_ll_std"] = 1
430
+ print("WARNING: std of perturbed sampled is 0, setting to 1")
431
+ print(
432
+ f"Number of unique perturbed sampled texts: {len(set(res['perturbed_sampled']))}"
433
+ )
434
+ print(f"Sampled text: {res['sampled']}")
435
+ predictions["real"].append(
436
+ (res["original_ll"] - res["perturbed_original_ll"])
437
+ / res["perturbed_original_ll_std"]
438
+ )
439
+ predictions["samples"].append(
440
+ (res["sampled_ll"] - res["perturbed_sampled_ll"]) / res["perturbed_sampled_ll_std"]
441
+ )
442
+
443
+ fpr, tpr, roc_auc = get_roc_metrics(predictions["real"], predictions["samples"])
444
+ p, r, pr_auc = get_precision_recall_metrics(predictions["real"], predictions["samples"])
445
+ name = f"perturbation_{n_perturbations}_{criterion}"
446
+ print(f"{name} ROC AUC: {roc_auc}, PR AUC: {pr_auc}")
447
+ return {
448
+ "name": name,
449
+ "predictions": predictions,
450
+ "info": {
451
+ "pct_words_masked": pct_words_masked,
452
+ "span_length": span_length,
453
+ "n_perturbations": n_perturbations,
454
+ "n_samples": n_samples,
455
+ },
456
+ "raw_results": results,
457
+ "metrics": {
458
+ "roc_auc": roc_auc,
459
+ "fpr": fpr,
460
+ "tpr": tpr,
461
+ },
462
+ "pr_metrics": {
463
+ "pr_auc": pr_auc,
464
+ "precision": p,
465
+ "recall": r,
466
+ },
467
+ "loss": 1 - pr_auc,
468
+ }
469
+
470
+
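For reference, the two criteria computed in run_perturbation_experiment above are the DetectGPT discrepancy and its std-normalized variant; per example they reduce to the following, using the fields filled in by get_perturbation_results:

    # criterion "d": raw perturbation discrepancy (tends to be larger for model-generated text)
    d_human = res["original_ll"] - res["perturbed_original_ll"]
    d_model = res["sampled_ll"] - res["perturbed_sampled_ll"]
    # criterion "z": the same discrepancy divided by the std over the perturbations
    z_human = d_human / res["perturbed_original_ll_std"]
    z_model = d_model / res["perturbed_sampled_ll_std"]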
471
+ ## DetectGPT Running: get perturbation results
472
+ import json
473
+
474
+
475
+ # save the ROC curve for each experiment, given a list of output dictionaries, one for each experiment, using colorblind-friendly colors
476
+ def save_roc_curves(experiments, save_folder, args):
477
+ # first, clear plt
478
+ plt.clf()
479
+
480
+ for experiment, color in zip(experiments, COLORS):
481
+ metrics = experiment["metrics"]
482
+ plt.plot(
483
+ metrics["fpr"],
484
+ metrics["tpr"],
485
+ label=f"{experiment['name']}, roc_auc={metrics['roc_auc']:.3f}",
486
+ color=color,
487
+ )
488
+ # print roc_auc for this experiment
489
+ print(f"{experiment['name']} roc_auc: {metrics['roc_auc']:.3f}")
490
+ plt.plot([0, 1], [0, 1], color="black", lw=2, linestyle="--")
491
+ plt.xlim([0.0, 1.0])
492
+ plt.ylim([0.0, 1.05])
493
+ plt.xlabel("False Positive Rate")
494
+ plt.ylabel("True Positive Rate")
495
+ plt.title(f"ROC Curves ({args.base_model_name} - {args.mask_filling_model_name})")
496
+ plt.legend(loc="lower right", fontsize=6)
497
+ plt.savefig(f"{save_folder}/roc_curves.png")
498
+
499
+
500
+ def save_roc_curves_w_ztest(experiments, zscore_sample, zscore_original, save_folder, args):
501
+ # first, clear plt
502
+ plt.clf()
503
+
504
+ ## make ztest ROC curve
505
+ positive_preds = np.array(zscore_sample)
506
+ negative_preds = np.array(zscore_original)
507
+ positive_labels = np.ones_like(positive_preds, dtype=int)
508
+ negative_labels = np.zeros_like(negative_preds, dtype=int)
509
+
510
+ all_preds = np.concatenate((positive_preds, negative_preds))
511
+ all_labels = np.concatenate((positive_labels, negative_labels))
512
+
513
+ fpr_z, tpr_z, _ = roc_curve(all_labels, all_preds)
514
+ roc_auc_z = auc(fpr_z, tpr_z)
515
+ plt.plot(fpr_z, tpr_z, label=f"z-score test, roc_auc={roc_auc_z:.3f}")
516
+ print(f"ztest roc_auc: {roc_auc_z:.3f}")
517
+
518
+ for experiment, color in zip(experiments, COLORS):
519
+ metrics = experiment["metrics"]
520
+ plt.plot(
521
+ metrics["fpr"],
522
+ metrics["tpr"],
523
+ label=f"{experiment['name']}, roc_auc={metrics['roc_auc']:.3f}",
524
+ color=color,
525
+ )
526
+ # print roc_auc for this experiment
527
+ print(f"{experiment['name']} roc_auc: {metrics['roc_auc']:.3f}")
528
+ plt.plot([0, 1], [0, 1], color="black", lw=2, linestyle="--")
529
+ plt.xlim([0.0, 1.0])
530
+ plt.ylim([0.0, 1.05])
531
+ plt.xlabel("False Positive Rate")
532
+ plt.ylabel("True Positive Rate")
533
+ plt.title(f"ROC Curves ({args.base_model_name} - {args.mask_filling_model_name})")
534
+ plt.legend(loc="lower right", fontsize=6)
535
+ plt.savefig(f"{save_folder}/roc_curves_w_ztests.png")
536
+
537
+
538
+ # save the histogram of log likelihoods in two side-by-side plots, one for real and real perturbed, and one for sampled and sampled perturbed
539
+ def save_ll_histograms(experiments, save_folder):
540
+ # first, clear plt
541
+ plt.clf()
542
+
543
+ for experiment in experiments:
544
+ try:
545
+ results = experiment["raw_results"]
546
+ # plot histogram of sampled/perturbed sampled on left, original/perturbed original on right
547
+ plt.figure(figsize=(20, 6))
548
+ plt.subplot(1, 2, 1)
549
+ plt.hist([r["sampled_ll"] for r in results], alpha=0.5, bins="auto", label="sampled")
550
+ plt.hist(
551
+ [r["perturbed_sampled_ll"] for r in results],
552
+ alpha=0.5,
553
+ bins="auto",
554
+ label="perturbed sampled",
555
+ )
556
+ plt.xlabel("log likelihood")
557
+ plt.ylabel("count")
558
+ plt.legend(loc="upper right")
559
+ plt.subplot(1, 2, 2)
560
+ plt.hist([r["original_ll"] for r in results], alpha=0.5, bins="auto", label="original")
561
+ plt.hist(
562
+ [r["perturbed_original_ll"] for r in results],
563
+ alpha=0.5,
564
+ bins="auto",
565
+ label="perturbed original",
566
+ )
567
+ plt.xlabel("log likelihood")
568
+ plt.ylabel("count")
569
+ plt.legend(loc="upper right")
570
+ plt.savefig(f"{save_folder}/ll_histograms_{experiment['name']}.png")
571
+ except:
572
+ pass
573
+
574
+
575
+ # save the histograms of log likelihood ratios in two side-by-side plots, one for real and real perturbed, and one for sampled and sampled perturbed
576
+ def save_llr_histograms(experiments, save_folder):
577
+ # first, clear plt
578
+ plt.clf()
579
+
580
+ for experiment in experiments:
581
+ try:
582
+ results = experiment["raw_results"]
583
+ # plot histogram of sampled/perturbed sampled on left, original/perturbed original on right
584
+ plt.figure(figsize=(20, 6))
585
+ plt.subplot(1, 2, 1)
586
+
587
+ # compute the log likelihood ratio for each result
588
+ for r in results:
589
+ r["sampled_llr"] = r["sampled_ll"] - r["perturbed_sampled_ll"]
590
+ r["original_llr"] = r["original_ll"] - r["perturbed_original_ll"]
591
+
592
+ plt.hist([r["sampled_llr"] for r in results], alpha=0.5, bins="auto", label="sampled")
593
+ plt.hist([r["original_llr"] for r in results], alpha=0.5, bins="auto", label="original")
594
+ plt.xlabel("log likelihood ratio")
595
+ plt.ylabel("count")
596
+ plt.legend(loc="upper right")
597
+ plt.savefig(f"{save_folder}/llr_histograms_{experiment['name']}.png")
598
+ except:
599
+ pass
600
+
601
+
602
+ if __name__ == "__main__":
603
+ parser = argparse.ArgumentParser(
604
+ description="Run detect-gpt with watermarked and baseline generations"
605
+ )
606
+ parser.add_argument(
607
+ "--base_model_name",
608
+ type=str,
609
+ default="facebook/opt-1.3b",
610
+ help="Main model, path to pretrained model or model identifier from huggingface.co/models.",
611
+ )
612
+ parser.add_argument(
613
+ "--token_len",
614
+ type=int,
615
+ default=200,
616
+ )
617
+ parser.add_argument(
618
+ "--n_samples",
619
+ type=int,
620
+ default=500,
621
+ )
622
+ parser.add_argument(
623
+ "--chunk_size",
624
+ type=int,
625
+ default=32,
626
+ )
627
+ parser.add_argument(
628
+ "--data_path",
629
+ type=str,
630
+ )
631
+ parser.add_argument("--data_split", type=str, default="wm")
632
+ parser.add_argument(
633
+ "--mask_filling_model_name",
634
+ type=str,
635
+ default="t5-3b",
636
+ )
637
+ parser.add_argument("--n_positions", type=int, default=512)
638
+ parser.add_argument(
639
+ "--pct_words_masked",
640
+ type=float,
641
+ default=0.3,
642
+ )
643
+ parser.add_argument(
644
+ "--do_chunk",
645
+ action="store_true",
646
+ default=False,
647
+ )
648
+ parser.add_argument("--filter", type=str, default=None)
649
+ parser.add_argument("--mask_top_p", type=float, default=1.0)
650
+ parser.add_argument("--n_perturbation_list", type=str, default="1,10,100")
651
+ parser.add_argument("--n_perturbation_rounds", type=int, default=1)
652
+ parser.add_argument("--span_length", type=int, default=2)
653
+ parser.add_argument("--buffer_size", type=int, default=1)
654
+
655
+ args = parser.parse_args()
656
+ if args.token_len > 300:
657
+ args.do_chunk = True
658
+ ## load data
659
+ list_of_dict = load_jsonlines(args.data_path)
660
+ raw_data = Dataset.from_list(list_of_dict)
661
+ df = raw_data.to_pandas()
662
+ ## drop samples that are too short
663
+ original_len = len(df)
664
+ print(f"Original #samples: {original_len}")
665
+ if args.filter == "length":
666
+ df = df[
667
+ (df["baseline_completion_length"] == args.token_len)
668
+ & (df["no_wm_output_length"] == args.token_len)
669
+ & (df["w_wm_output_length"] == args.token_len)
670
+ ]
671
+ print(f" after filtering token length: {len(df)}")
672
+ if args.filter == "null":
673
+ try:
674
+ df = df[
675
+ (df["w_wm_output_length"].notnull())
676
+ & (df["w_wm_output_attacked_length"].notnull())
677
+ & ~(df["w_wm_output_length"] == "")
678
+ & ~(df["w_wm_output_attacked_length"] == 0)
679
+ ]
680
+ print(f" after filtering token length: {len(df)}")
681
+ except:
682
+ print(
683
+ "failed to filter null entries, probably because the file does not contain column 'w_wm_output_attacked_length'. "
684
+ )
685
+ args.n_samples = len(df)
686
+
687
+ ## load models
688
+ int8_kwargs = {}
689
+ half_kwargs = {}
690
+ if (
691
+ "glm" not in args.mask_filling_model_name
692
+ ): # GLM uses an OP that's not supported in BFloat16: "triu_tril_cuda_template" not implemented for 'BFloat16'
693
+ half_kwargs = dict(torch_dtype=torch.bfloat16)
694
+ else:
695
+ half_kwargs = dict(torch_dtype=torch.float16)
696
+
697
+ ## load the base model (for generation) and base tokenizer
698
+ optional_tok_kwargs = {}
699
+ if "facebook/opt-" in args.base_model_name:
700
+ print("Using non-fast tokenizer for OPT")
701
+ optional_tok_kwargs["use_fast"] = False
702
+ base_model = transformers.AutoModelForCausalLM.from_pretrained(
703
+ args.base_model_name, **half_kwargs
704
+ )
705
+ base_model.eval()
706
+
707
+ ####### load base tokenizer ########
708
+ if "llama" in args.base_model_name:
709
+ from transformers import LlamaTokenizer
710
+
711
+ base_tokenizer = LlamaTokenizer.from_pretrained(
712
+ args.base_model_name, padding_side="left", **optional_tok_kwargs
713
+ )
714
+ else:
715
+ base_tokenizer = transformers.AutoTokenizer.from_pretrained(
716
+ args.base_model_name, padding_side="left", **optional_tok_kwargs
717
+ )
718
+ base_tokenizer.pad_token_id = base_tokenizer.eos_token_id
719
+
720
+ print(f"Loading mask filling model {args.mask_filling_model_name}...")
721
+ mask_model = transformers.AutoModelForSeq2SeqLM.from_pretrained(
722
+ args.mask_filling_model_name,
723
+ **int8_kwargs,
724
+ **half_kwargs,
725
+ trust_remote_code="glm" in args.mask_filling_model_name,
726
+ )
727
+ mask_model.eval()
728
+
729
+ ## mask model max length
730
+ try:
731
+ if "glm" in args.mask_filling_model_name:
732
+ n_positions = mask_model.config.max_sequence_length
733
+ else:
734
+ n_positions = mask_model.config.n_positions
735
+ except AttributeError:
736
+ n_positions = 512
737
+
738
+ # if n_positions < args.token_len:
739
+ # raise ValueError(f"Mask model cannot handle input longer then {n_positions}. Input token length: {args.token_len}")
740
+ # preproc_tokenizer = transformers.AutoTokenizer.from_pretrained('t5-small', model_max_length=n_positions)
741
+ mask_tokenizer = transformers.AutoTokenizer.from_pretrained(
742
+ args.mask_filling_model_name,
743
+ model_max_length=n_positions,
744
+ trust_remote_code="glm" in args.mask_filling_model_name,
745
+ )
746
+ mask_model.cpu()
747
+
748
+ # perturbing text ops
749
+ # define regex to match all <extra_id_*> tokens, where * is an integer
750
+ pattern = re.compile(r"<extra_id_\d+>")
751
+
752
+ SAVE_FOLDER = f'{OUTPUT_DIR}/detect-gpt/{os.path.basename(os.path.dirname(args.data_path))}-{args.data_split}-mask{args.mask_filling_model_name}/maskpct{args.pct_words_masked}-{os.path.basename(args.data_path).split(".")[0]}-ns{args.n_samples}'
753
+ os.makedirs(SAVE_FOLDER, exist_ok=True)
754
+
755
+ outputs = []
756
+ n_perturbation_list = [int(x) for x in args.n_perturbation_list.split(",")]
757
+ for n_perturbations in n_perturbation_list:
758
+ perturbation_results = get_perturbation_results(
759
+ args.span_length,
760
+ args.chunk_size,
761
+ n_perturbations,
762
+ args.n_perturbation_rounds,
763
+ args.pct_words_masked,
764
+ args.data_split,
765
+ args.mask_filling_model_name,
766
+ args.do_chunk,
767
+ save_path=SAVE_FOLDER,
768
+ )
769
+ for perturbation_mode in ["d", "z"]:
770
+ output = run_perturbation_experiment(
771
+ perturbation_results,
772
+ perturbation_mode,
773
+ span_length=args.span_length,
774
+ n_perturbations=n_perturbations,
775
+ pct_words_masked=args.pct_words_masked,
776
+ n_samples=args.n_samples,
777
+ )
778
+ outputs.append(output)
779
+ ## write columns to the input df
780
+ df[
781
+ f"baseline_completion_detectgpt_score_{n_perturbations}_{perturbation_mode}"
782
+ ] = output["predictions"]["real"]
783
+ df[f"no_wm_output_detectgpt_score_{n_perturbations}_{perturbation_mode}"] = output[
784
+ "predictions"
785
+ ]["samples"]
786
+ with open(
787
+ os.path.join(
788
+ SAVE_FOLDER, f"perturbation_{n_perturbations}_{perturbation_mode}_results.json"
789
+ ),
790
+ "w",
791
+ ) as f:
792
+ json.dump(output, f)
793
+
794
+ ## save the updated input df
795
+ with open(os.path.join(SAVE_FOLDER, os.path.basename(args.data_path)), "w") as f:
796
+ print(df.to_json(orient="records", lines=True), file=f, flush=False, end="")
797
+ ## save meta file
798
+ gen_table_meta = args.__dict__
799
+ write_json(gen_table_meta, os.path.join(SAVE_FOLDER, "gen_table_meta.json"), indent=4)
800
+
801
+ ### plot curves and histograms
802
+
803
+ save_roc_curves(outputs, SAVE_FOLDER, args)
804
+ # save_roc_curves_w_ztest(outputs, df["w_wm_output_z_score"],
805
+ # df["baseline_completion_z_score"], SAVE_FOLDER, args)
806
+ save_ll_histograms(outputs, SAVE_FOLDER)
807
+ save_llr_histograms(outputs, SAVE_FOLDER)
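
For reference, the "d" and "z" modes passed to run_perturbation_experiment above correspond to the two standard DetectGPT criteria: the raw perturbation discrepancy and its normalized variant. A minimal sketch, assuming orig_ll is the base-model log-likelihood of a passage and perturbed_lls are the log-likelihoods of its mask-and-refill perturbations (names are illustrative, not taken from this file):

import numpy as np

def detectgpt_criteria(orig_ll, perturbed_lls):
    # raw discrepancy ("d"): how much more likely the original is than its perturbations
    perturbed = np.asarray(perturbed_lls, dtype=float)
    d_score = orig_ll - perturbed.mean()
    # normalized discrepancy ("z"): discrepancy in units of the perturbation spread
    std = perturbed.std() if perturbed.std() > 0 else 1.0
    z_score = d_score / std
    return d_score, z_score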
lm-watermarking-main/watermark_reliability_release/detectgpt/make_plot.py ADDED
@@ -0,0 +1,124 @@
1
+ # Basic imports
2
+ import os
3
+ import argparse
4
+ import json
5
+ import re
6
+
7
+ import matplotlib.pyplot as plt
8
+ from matplotlib import rc
9
+ rc('font', **{'family': 'serif', 'serif': ['Computer Modern']})
10
+ rc('text', usetex=True)
11
+ from sklearn.metrics import roc_curve, precision_recall_curve, auc
12
+
13
+
14
+ import sys
15
+ sys.path.insert(0, "..")
16
+
17
+ from datasets import Dataset
18
+ from utils.io import read_jsonlines, load_jsonlines
19
+
20
+ import transformers
21
+
22
+ from detectgpt_main import save_roc_curves_w_ztest
23
+
24
+
25
+ INPUT_DIR = "/cmlscratch/manlis/test/watermarking-root/input"
26
+ OUTPUT_DIR = "/cmlscratch/manlis/test/watermarking-root/output"
27
+
28
+ # 15 colorblind-friendly colors
29
+ COLORS = ["#0072B2", "#009E73", "#D55E00", "#CC79A7", "#F0E442",
30
+ "#56B4E9", "#E69F00", "#000000", "#0072B2", "#009E73",
31
+ "#D55E00", "#CC79A7", "#F0E442", "#56B4E9", "#E69F00"]
32
+
33
+
34
+ if __name__=="__main__":
35
+ parser = argparse.ArgumentParser(
36
+ description="Run detect-gpt with watermarked and baseline generations"
37
+ )
38
+ parser.add_argument(
39
+ "--base_model_name",
40
+ type=str,
41
+ default="facebook/opt-1.3b",
42
+ help="Main model, path to pretrained model or model identifier from huggingface.co/models.",
43
+ )
44
+ parser.add_argument(
45
+ "--data_name",
46
+ type=str,
47
+ )
48
+ parser.add_argument(
49
+ "--token_len",
50
+ type=int,
51
+ default=200,
52
+ )
53
+ parser.add_argument(
54
+ "--n_samples",
55
+ type=int,
56
+ default=500,
57
+ )
58
+ parser.add_argument(
59
+ "--chunk_size",
60
+ type=int,
61
+ default=500,
62
+ )
63
+ parser.add_argument(
64
+ "--data_path",
65
+ type=str,
66
+ )
67
+ parser.add_argument(
68
+ "--data_split",
69
+ type=str,
70
+ default="wm"
71
+ )
72
+ parser.add_argument(
73
+ "--mask_filling_model_name",
74
+ type=str,
75
+ default="t5-3b",
76
+ )
77
+ parser.add_argument('--n_positions', type=int, default=512)
78
+ parser.add_argument(
79
+ "--pct_words_masked",
80
+ type=float,
81
+ default=0.3,
82
+ )
83
+ parser.add_argument('--mask_top_p', type=float, default=1.0)
84
+ parser.add_argument('--n_perturbation_list', type=str, default="1,10,100")
85
+ parser.add_argument('--n_perturbation_rounds', type=int, default=1)
86
+ parser.add_argument('--span_length', type=int, default=2)
87
+ parser.add_argument('--buffer_size', type=int, default=1)
88
+
89
+ args = parser.parse_args()
90
+
91
+ ## load data
92
+ list_of_dict = load_jsonlines(args.data_path)
93
+ raw_data = Dataset.from_list(list_of_dict)
94
+ df = raw_data.to_pandas()
95
+ ## drop samples that are too short
96
+ original_len = len(df)
97
+ df = df[(df["baseline_completion_length"] == args.token_len) \
98
+ & (df["no_wm_num_tokens_generated"] == args.token_len) \
99
+ & (df["w_wm_num_tokens_generated"] == args.token_len) ]
100
+ print(f"Origianl #samples: {original_len}, after filtering token length: {len(df)}")
101
+ args.n_samples = len(df)
102
+
103
+ # perturbing text ops
104
+ # define regex to match all <extra_id_*> tokens, where * is an integer
105
+ pattern = re.compile(r"<extra_id_\d+>")
106
+
107
+ SAVE_FOLDER = f'{OUTPUT_DIR}/detect-gpt/{args.data_name}-{args.data_split}-mask{args.mask_filling_model_name}/maskpct{args.pct_words_masked}-ns{args.n_samples}'
108
+ os.makedirs(SAVE_FOLDER, exist_ok=True)
109
+
110
+ outputs = []
111
+
112
+ n_perturbation_list = [int(x) for x in args.n_perturbation_list.split(",")]
113
+ for n_perturbations in n_perturbation_list:
114
+ for perturbation_mode in ['d', 'z']:
115
+ with open(os.path.join(SAVE_FOLDER, f"perturbation_{n_perturbations}_{perturbation_mode}_results.json"), "r") as f:
116
+ output = json.load(f)
117
+ outputs.append(output)
118
+
119
+
120
+ ### plot curves and histograms
121
+
122
+ save_roc_curves_w_ztest(outputs, df["w_wm_output_z_score"],
123
+ df["baseline_completion_z_score"], SAVE_FOLDER, args,
124
+ )
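
The heavy lifting in make_plot.py is delegated to save_roc_curves_w_ztest imported from detectgpt_main. As a rough, illustrative sketch of what such an ROC comparison reduces to with the sklearn and matplotlib imports above (the score arrays and labels here are placeholders, not columns guaranteed by this repo):

import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve, auc

def plot_roc(human_scores, machine_scores, label, save_path):
    # human passages are the negatives, machine generations the positives
    scores = np.concatenate([human_scores, machine_scores])
    labels = np.concatenate([np.zeros(len(human_scores)), np.ones(len(machine_scores))])
    fpr, tpr, _ = roc_curve(labels, scores, pos_label=1)
    plt.plot(fpr, tpr, label=f"{label} (AUC={auc(fpr, tpr):.3f})")
    plt.xlabel("False positive rate")
    plt.ylabel("True positive rate")
    plt.legend()
    plt.savefig(save_path)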
lm-watermarking-main/watermark_reliability_release/detectgpt/plot.sh ADDED
@@ -0,0 +1,46 @@
1
+ #!/bin/bash
2
+
3
+ #SBATCH --partition tron
4
+
5
+ #SBATCH --gres=gpu:rtxa5000:1
6
+
7
+ #SBATCH --ntasks=4
8
+
9
+ #SBATCH --mem=32G
10
+
11
+ #SBATCH --account=nexus
12
+
13
+ #SBATCH --qos=default
14
+
15
+ #SBATCH --time=48:00:00
16
+
17
+ #SBATCH --array=0
18
+
19
+ #SBATCH --output=slurm_logs/%A_%a.out
20
+
21
+ #SBATCH --job-name=gen-small
22
+
23
+ source ~/.bashrc
24
+ conda activate watermarking-dev
25
+
26
+ OUTPUT_DIR=/cmlscratch/manlis/test/watermarking-root/input/new_runs
27
+
28
+ model_name="facebook/opt-1.3b"
29
+ data_name="test_len_200_opt1_3b"
30
+ token_len=200
31
+ chunk_size=32
32
+ data_path="/cmlscratch/manlis/test/watermarking-root/input/new_runs/test_len_200_opt1_3b_evaluation/gen_table_w_metrics.jsonl"
33
+ split="no_wm"
34
+
35
+
36
+ python make_plot.py \
37
+ --n_perturbation_list="1,10,100" \
38
+ --base_model_name=${model_name} \
39
+ --data_name=${data_name} \
40
+ --data_path=${data_path} \
41
+ --token_len=${token_len} \
42
+ --chunk_size=${chunk_size} \
43
+ --data_split=${split};
44
+
45
+
46
+
lm-watermarking-main/watermark_reliability_release/detectgpt/run_detectgpt.sh ADDED
@@ -0,0 +1,63 @@
1
+ #!/bin/bash
2
+
3
+ #SBATCH --partition scavenger
4
+
5
+ #SBATCH --gres=gpu:rtxa6000:1
6
+
7
+ #SBATCH --ntasks=4
8
+
9
+ #SBATCH --mem=32G
10
+
11
+ #SBATCH --account=scavenger
12
+
13
+ #SBATCH --qos=scavenger
14
+
15
+ #SBATCH --time=24:00:00
16
+
17
+ #SBATCH --array=0-2
18
+
19
+ #SBATCH --output=slurm_logs/no_wm_attack_%A_%a.out
20
+
21
+ #SBATCH --job-name=run-detect
22
+
23
+ source ~/.bashrc
24
+ conda activate watermarking-dev
25
+
26
+ # OUTPUT_DIR=/cmlscratch/manlis/test/watermarking-root/input/new_runs
27
+
28
+ # model_name="facebook/opt-1.3b"
29
+ # data_path="/cmlscratch/manlis/test/watermarking-root/input/new_runs/test_len_200_opt1_3b_evaluation/gen_table_w_metrics.jsonl"
30
+
31
+ # model_name='facebook/opt-6.7b'
32
+ # data_path='/cmlscratch/manlis/test/watermarking-root/input/core_simple_1_50_200_gen/gen_table.jsonl'
33
+ # data_path=input/core_simple_1_200_1000_gen_prefixes/gen_table_prefixes_200.jsonl
34
+ model_name='/cmlscratch/manlis/test/watermarking-root/local_model/llama-7b-base'
35
+
36
+ mask_model="t5-3b"
37
+
38
+ # token_len=200
39
+ chunk_size=32 # can run 32 when textlen=200
40
+ pct=0.3
41
+ # split="no_wm"
42
+ split='no_wm_paraphrase'
43
+
44
+ declare -a commands
45
+
46
+
47
+ # for textlen in 50 100 200;
48
+ for textlen in 50 100 200;
49
+ do
50
+ commands+=( "python detectgpt_main.py \
51
+ --n_perturbation_list='100' \
52
+ --do_chunk \
53
+ --base_model_name=${model_name} \
54
+ --mask_filling_model_name=${mask_model} \
55
+ --filter='null' \
56
+ --data_path=/cmlscratch/manlis/test/watermarking-root/input/core_simple_1_200_1000_no_wm_gpt_p4_prefixes/gen_table_prefixes_${textlen}.jsonl \
57
+ --token_len=${textlen} \
58
+ --pct_words_masked=${pct} \
59
+ --chunk_size=${chunk_size} \
60
+ --data_split=${split};" )
61
+ done
62
+
63
+ bash -c "${commands[${SLURM_ARRAY_TASK_ID}]}"
lm-watermarking-main/watermark_reliability_release/evaluation_pipeline.py ADDED
@@ -0,0 +1,1330 @@
1
+ # coding=utf-8
2
+ # Copyright 2023 Authors of "A Watermark for Large Language Models"
3
+ # available at https://arxiv.org/abs/2301.10226
4
+ #
5
+ # Licensed under the Apache License, Version 2.0 (the "License");
6
+ # you may not use this file except in compliance with the License.
7
+ # You may obtain a copy of the License at
8
+ #
9
+ # http://www.apache.org/licenses/LICENSE-2.0
10
+ #
11
+ # Unless required by applicable law or agreed to in writing, software
12
+ # distributed under the License is distributed on an "AS IS" BASIS,
13
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14
+ # See the License for the specific language governing permissions and
15
+ # limitations under the License.
16
+
17
+ from types import NoneType
18
+
19
+ from typing import Union
20
+ import os
21
+ import argparse
22
+ from functools import partial
23
+ from tqdm import tqdm
24
+
25
+ import wandb
26
+ import torch
27
+ import numpy as np
28
+ import sklearn.metrics as metrics
29
+
30
+ from datasets import Dataset, Sequence
31
+ from transformers import DataCollatorWithPadding
32
+
33
+ from utils.submitit import str2bool # better bool flag type for argparse
34
+ from utils.io import read_jsonlines, read_json, write_json, write_jsonlines
35
+ from utils.notebooks import filter_text_col_length, infer_length_column
36
+
37
+ from utils.evaluation import (
38
+ SUPPORTED_METRICS,
39
+ NO_CHECK_ARGS,
40
+ ROC_TEST_STAT_SUFFIXES,
41
+ FILTER_BY_COLUMNS,
42
+ conditional_no_check_args,
43
+ load_oracle_model,
44
+ evaluate_ppl,
45
+ load_detector,
46
+ compute_z_scores,
47
+ compute_windowed_z_scores,
48
+ compute_run_len_chsqrd_stats,
49
+ compute_repetition_diversity,
50
+ compute_p_sp,
51
+ compute_coherence,
52
+ compute_mauve,
53
+ compute_detect_retrieval,
54
+ load_tokenizer,
55
+ concat_rows,
56
+ )
57
+
58
+ print(f"Current huggingface cache dir: {os.environ['HF_HOME']}")
59
+
60
+ from datasets import disable_caching
61
+
62
+ disable_caching()
63
+
64
+
65
+ def main(args):
66
+ ###########################################################################
67
+ # Create output dir if it doesn't exist, and warn if it contains metric file
68
+ ###########################################################################
69
+ gen_table_w_metrics_path = f"{args.output_dir}/gen_table_w_metrics.jsonl"
70
+ metrics_meta_path = f"{args.output_dir}/gen_table_w_metrics_meta.json"
71
+
72
+ print(f"Output dir for this run: {args.output_dir}")
73
+ # notify if exists
74
+ if os.path.exists(args.output_dir):
75
+ print(f"Output dir for this run already exists!")
76
+ print(f"Contents: {sorted(os.listdir(args.output_dir))}")
77
+ # warn if metrics file exists
78
+ if os.path.exists(gen_table_w_metrics_path):
79
+ if not args.overwrite_output_file:
80
+ print(
81
+ f"WARNING: Exiting to avoid overwriting output file. "
82
+ f"Pass the '--overwrite_output_file' flag to ignore this check."
83
+ )
84
+ exit()
85
+ else:
86
+ print(
87
+ f"WARNING: Found existing generation files with metrics added at this output dir. "
88
+ f"Overwriting anyway :/"
89
+ )
90
+ else:
91
+ # create the output dir where run artifacts are stored
92
+ os.makedirs(args.output_dir)
93
+
94
+ ###########################################################################
95
+ # Parse metrics to log - ppl, zscore, etc
96
+ ###########################################################################
97
+
98
+ # check that all metrics are supported
99
+ metric_support = [metric in SUPPORTED_METRICS for metric in args.evaluation_metrics]
100
+ assert all(metric_support), (
101
+ f"Unsupported metric '{args.evaluation_metrics[metric_support.index(False)]}' in"
102
+ f" {args.evaluation_metrics}. Supported metrics are: {SUPPORTED_METRICS}"
103
+ )
104
+ # Hack check that if prefix_lengths exists then the method must be
105
+ # detect-retrieval (for now) because other methods don't support the
106
+ # sparse dataset with Nones all over the place
107
+ if "prefix_lengths" in args.__dict__:
108
+ # assert args.evaluation_metrics == [
109
+ # "detect-retrieval"
110
+ # ], f"Currently, only the detect-retrieval metric supports the prefix_lengths column. "
111
+ print(
112
+ f"WARNING: Found prefix_lengths column assuming that this is either retireval or detectgpt"
113
+ )
114
+
115
+ print(f"Evaluation metrics to compute: {args.evaluation_metrics}")
116
+
117
+ ###########################################################################
118
+ # Load generations
119
+ ###########################################################################
120
+ print(f"Input dir for this run: {args.input_dir}")
121
+ print(f"Loading previously generated outputs for evaluation via oracle model and metrics...")
122
+
123
+ # check for the "attacked version" of the gen table first
124
+ gen_table_meta_path = f"{args.input_dir}/gen_table_attacked_meta.json"
125
+ gen_table_path = f"{args.input_dir}/gen_table_attacked.jsonl"
126
+ safe_gen_table_path = f"{args.input_dir}/gen_table_attacked_safe.jsonl"
127
+ loaded_attacked = True
128
+
129
+ attack_variants_exist = [
130
+ os.path.exists(gen_table_meta_path),
131
+ os.path.exists(gen_table_path),
132
+ ]
133
+ if not all(attack_variants_exist):
134
+ loaded_attacked = False
135
+ gen_table_meta_path = f"{args.input_dir}/gen_table_meta.json"
136
+ gen_table_path = f"{args.input_dir}/gen_table.jsonl"
137
+ safe_gen_table_path = f"{args.input_dir}/gen_table_safe.jsonl"
138
+
139
+ assert os.path.exists(
140
+ gen_table_meta_path
141
+ ), f"failed file check for prev generations metadata json file: {gen_table_meta_path}"
142
+ assert os.path.exists(
143
+ gen_table_path
144
+ ), f"failed file check for prev generations jsonl file: {gen_table_path}"
145
+
146
+ assert not os.path.exists(safe_gen_table_path), (
147
+ f"failed for safety bc there is a secondary 'safe' marked file",
148
+ f" in this dir indicating a possible issue with the generation step. ",
149
+ )
150
+
151
+ cmdline_args = args.__dict__.copy()
152
+ prev_gen_table_meta = read_json(gen_table_meta_path)
153
+
154
+ joined_args = prev_gen_table_meta.copy()
155
+ for k, v in cmdline_args.items():
156
+ if v is not None:
157
+ joined_args.update({k: v})
158
+ else:
159
+ print(
160
+ f"cmdline arg {k} is None, leaving it as the value found in the input metadata: {prev_gen_table_meta[k]}"
161
+ )
162
+
163
+ # check that the args used to generate the prev generations are the same as
164
+ # the current args, for the intersection of keys
165
+ if not args.overwrite_args:
166
+ # update the no check args based on the current state of args
167
+ current_no_check_args = conditional_no_check_args(
168
+ NO_CHECK_ARGS, args.evaluation_metrics, args
169
+ )
170
+
171
+ for key in prev_gen_table_meta.keys():
172
+ if key in current_no_check_args:
173
+ continue
174
+ assert joined_args[key] == prev_gen_table_meta[key], (
175
+ f"failed for safety bc after merging the prev metadata with "
176
+ f"the current cmdline args, values for '{key}' are not the same. "
177
+ f"in metadata: {prev_gen_table_meta[key]}, passed: {cmdline_args[key]}. "
178
+ f"Pass the '--overwrite_args' flag to ignore this check."
179
+ )
180
+
181
+ args = argparse.Namespace(**joined_args)
182
+ gen_table = [ex for ex in read_jsonlines(gen_table_path)]
183
+ if args.limit_rows == -1:
184
+ gen_table_ds = Dataset.from_list(gen_table)
185
+ else:
186
+ gen_table_ds = Dataset.from_list(gen_table[: args.limit_rows])
187
+
188
+ ###########################################################################
189
+ # Extract the seeding scheme fine grained parameters
190
+ ###########################################################################
191
+ from utils.evaluation import scheme_hparam_extractor
192
+
193
+ args.__dict__.update(scheme_hparam_extractor(args.seeding_scheme))
194
+
195
+ print(f"seeding_scheme: {args.seeding_scheme}")
196
+ print(f"prf_type: {args.prf_type}")
197
+ print(f"anchored: {args.anchored}")
198
+ print(f"context_width: {args.context_width}")
199
+ print(f"self_salt: {args.self_salt}")
200
+
201
+ ###########################################################################
202
+ # Concat logic for multiple generations
203
+ ###########################################################################
204
+
205
+ if args.concat_rows != 0:
206
+ assert isinstance(args.concat_rows, int), f"Invalid concat_rows arg: {args.concat_rows}. "
207
+
208
+ # set to all rows if -1
209
+ if args.concat_rows == -1:
210
+ args.concat_rows = len(gen_table_ds)
211
+
212
+ if args.shuffle_before_concat:
213
+ print(f"Shuffling the gen table before concatenating every {args.concat_rows} rows...")
214
+ gen_table_ds = gen_table_ds.shuffle()
215
+
216
+ print(f"Concatenating every {args.concat_rows} rows of the gen table...")
217
+
218
+ # we concat all cols in OUTPUT_TEXT_COLUMN_NAMES
219
+ # and update the length col to reflect the new length
220
+ # which means we need to tokenize the new text temporarily
221
+ # to get the new length
222
+
223
+ tokenizer = load_tokenizer(args)
224
+
225
+ concat_partial = partial(concat_rows, tokenizer=tokenizer, args=args)
226
+
227
+ # manually write a batch loop because hf datasets map doesn't support returning fewer rows than input
228
+ concatenated_rows = []
229
+ for i in tqdm(range(0, len(gen_table_ds), args.concat_rows)):
230
+ batch = gen_table_ds[i : i + args.concat_rows]
231
+ concatenated_rows.append(concat_partial(batch))
232
+ gen_table_concated_ds = Dataset.from_list(concatenated_rows)
233
+
234
+ # overwrite the args.max_new_tokens to reflect the implicit new target length T
235
+ # which is concat_rows * max_new_tokens
236
+ args.max_new_tokens = args.concat_rows * args.max_new_tokens
237
+
238
+ # write the dataset out in the same filename as the original
239
+ # but check that the input dir is different from the output dir
240
+ assert (
241
+ args.input_dir != args.output_dir
242
+ ), f"Input dir and output dir must be different to write out the result of concat rows."
243
+
244
+ if loaded_attacked:
245
+ concat_meta_path = f"{args.output_dir}/gen_table_attacked_meta.json"
246
+ concat_gen_table_path = f"{args.output_dir}/gen_table_attacked.jsonl"
247
+ else:
248
+ concat_meta_path = f"{args.output_dir}/gen_table_meta.json"
249
+ concat_gen_table_path = f"{args.output_dir}/gen_table.jsonl"
250
+
251
+ write_json(args.__dict__, concat_meta_path, indent=4)
252
+ gen_table_concated_lst = [ex for ex in gen_table_concated_ds]
253
+ write_jsonlines(gen_table_concated_lst, concat_gen_table_path)
254
+ else:
255
+ gen_table_concated_ds = gen_table_ds
256
+
257
+ ###########################################################################
258
+ # Additional args setup
259
+ ###########################################################################
260
+ # if target_T is not specified, use max_new_tokens (which will be in the reloaded gen metadata)
261
+ # and potentially overwritten by the concat logic above
262
+ if args.target_T == 0:
263
+ args.target_T = args.max_new_tokens
264
+
265
+ # storing slurm info to allow auditing logfiles
266
+ # note this is set after the metadata check to ignore overwriting
267
+ args.SLURM_JOB_ID = os.getenv("SLURM_JOB_ID")
268
+ args.SLURM_ARRAY_JOB_ID = os.getenv("SLURM_ARRAY_JOB_ID")
269
+ args.SLURM_ARRAY_TASK_ID = os.getenv("SLURM_ARRAY_TASK_ID")
270
+
271
+ ###########################################################################
272
+ # Start logging, we wait to do this until after loading the generations
273
+ # so that we can log the args used to generate them unioned with the
274
+ # cmdline args
275
+ ###########################################################################
276
+ if args.wandb:
277
+ # start a new wandb run to track this experiment, will send data to it
278
+ run = wandb.init(
279
+ # set the wandb project where this run will be logged
280
+ project=args.wandb_project,
281
+ entity=args.wandb_entity,
282
+ name=f"{args.run_name}",
283
+ # track hyperparameters and run metadata
284
+ config=args,
285
+ tags=args.wandb_tags,
286
+ )
287
+
288
+ ###########################################################################
289
+ # Perplexity (PPL) evaluation
290
+ # NOTE: basically requires a model on gpu, or is extremely slow
291
+ ###########################################################################
292
+ if "ppl" in args.evaluation_metrics:
293
+ assert args.oracle_model_name_or_path, "PPL metric requires oracle model."
294
+
295
+ # Load the oracle model for PPL measurement
296
+ oracle_model, oracle_tokenizer, _ = load_oracle_model(args)
297
+
298
+ # construct the collator
299
+ data_collator = DataCollatorWithPadding(
300
+ tokenizer=oracle_tokenizer, padding=True, pad_to_multiple_of=8
301
+ )
302
+
303
+ # construct fluency/ppl partial
304
+ evaluate_ppl_partial = partial(
305
+ evaluate_ppl,
306
+ oracle_model_name=args.oracle_model_name_or_path,
307
+ oracle_model=oracle_model,
308
+ oracle_tokenizer=oracle_tokenizer,
309
+ data_collator=data_collator,
310
+ )
311
+
312
+ print(f"Computing metrics on model generations: {gen_table_concated_ds}")
313
+
314
+ gen_table_w_ppl_ds = gen_table_concated_ds.map(
315
+ evaluate_ppl_partial,
316
+ batched=True,
317
+ batch_size=args.ppl_batch_size,
318
+ load_from_cache_file=False,
319
+ keep_in_memory=True,
320
+ )
321
+
322
+ # clear the model just for fun
323
+ oracle_model = oracle_model.to(torch.device("cpu"))
324
+ del oracle_model
325
+ else:
326
+ gen_table_w_ppl_ds = gen_table_concated_ds
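# Illustrative sketch (not from utils.evaluation): evaluate_ppl is assumed to follow the
# standard HF causal-LM pattern, where perplexity is the exponentiated mean negative
# log-likelihood of the text under the oracle model. The helper name below is hypothetical.
def _sequence_ppl(oracle_model, oracle_tokenizer, text):
    enc = oracle_tokenizer(text, return_tensors="pt").to(oracle_model.device)
    with torch.no_grad():
        loss = oracle_model(**enc, labels=enc["input_ids"]).loss  # mean NLL per token
    return torch.exp(loss).item()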
327
+
328
+ ###########################################################################
329
+ # Cheap to load, and required for all detectors so load it first
330
+ watermark_detector = load_detector(args)
331
+
332
+ # Map setup for all dataset operations:
333
+ map_setup = dict(batched=False, load_from_cache_file=False)
334
+ ###########################################################################
335
+ # z-score evaluation
336
+ # NOTE: requires a gpu because if original source of watermark randomness,
337
+ # RNG, is gpu based, then detector should be on gpu as well
338
+ ###########################################################################
339
+ if "z-score" in args.evaluation_metrics:
340
+ # set up the partial
341
+ compute_z_scores_partial = partial(
342
+ compute_z_scores,
343
+ watermark_detector=watermark_detector,
344
+ args=args,
345
+ )
346
+
347
+ gen_table_w_zscore_ds = gen_table_w_ppl_ds.map(
348
+ compute_z_scores_partial, **map_setup, desc="Computing z-scores"
349
+ )
350
+
351
+ else:
352
+ gen_table_w_zscore_ds = gen_table_w_ppl_ds
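# Illustrative sketch (not from utils.evaluation): compute_z_scores is assumed to apply the
# one-proportion z-test from the watermark paper, i.e. for T scored tokens of which
# green_count fall in the greenlist with expected fraction gamma:
#     z = (green_count - gamma * T) / sqrt(T * gamma * (1 - gamma))
def _green_fraction_z(green_count, total_tokens, gamma):
    expected = gamma * total_tokens
    return (green_count - expected) / np.sqrt(total_tokens * gamma * (1 - gamma))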
353
+
354
+ ###########################################################################
355
+ # Windowed z-score evaluation
356
+ ###########################################################################
357
+
358
+ if "windowed-z-score" in args.evaluation_metrics:
359
+ # set up the windowed partial
360
+ compute_windowed_z_scores_partial = partial(
361
+ compute_windowed_z_scores,
362
+ watermark_detector=watermark_detector,
363
+ args=args,
364
+ )
365
+
366
+ gen_table_w_windowed_zscore_ds = gen_table_w_zscore_ds.map(
367
+ compute_windowed_z_scores_partial, **map_setup, desc="Computing windowed z-scores"
368
+ )
369
+ else:
370
+ gen_table_w_windowed_zscore_ds = gen_table_w_zscore_ds
371
+
372
+ ###########################################################################
373
+ # run-len-chisqrd evaluation
374
+ ###########################################################################
375
+ if "run-len-chisqrd" in args.evaluation_metrics:
376
+ assert "w_wm_output_green_token_mask" in gen_table_w_windowed_zscore_ds.column_names, (
377
+ f"Currently, run-len-chisqrd metric requires the green token masks to be computed previously "
378
+ f"by one of the z-score metrics."
379
+ )
380
+ # this ^ is unused currently, but we will need it to remove the assert condition above
381
+
382
+ # set up the run len chisqrd partial
383
+ compute_run_len_chisqrd_partial = partial(
384
+ compute_run_len_chsqrd_stats,
385
+ watermark_detector=watermark_detector,
386
+ args=args,
387
+ )
388
+
389
+ gen_table_w_run_len_chisqrd_ds = gen_table_w_windowed_zscore_ds.map(
390
+ compute_run_len_chisqrd_partial, **map_setup, desc="Computing runlength tests"
391
+ )
392
+ else:
393
+ gen_table_w_run_len_chisqrd_ds = gen_table_w_windowed_zscore_ds
394
+
395
+ ###########################################################################
396
+ # Diversity and Repetition evaluation
397
+ ###########################################################################
398
+
399
+ if "repetition" in args.evaluation_metrics or "diversity" in args.evaluation_metrics:
400
+ # set up the partial
401
+ compute_repetition_partial = partial(
402
+ compute_repetition_diversity,
403
+ include_repetition=("repetition" in args.evaluation_metrics),
404
+ include_diversity=("diversity" in args.evaluation_metrics),
405
+ )
406
+
407
+ gen_table_w_repetition_ds = gen_table_w_run_len_chisqrd_ds.map(
408
+ compute_repetition_partial, **map_setup, desc="Computing text repetition and diversity"
409
+ )
410
+ else:
411
+ gen_table_w_repetition_ds = gen_table_w_run_len_chisqrd_ds
412
+
413
+ ###########################################################################
414
+ # P-SP evaluation
415
+ ###########################################################################
416
+
417
+ if "p-sp" in args.evaluation_metrics:
418
+ print(f"Loading the P-SP model and computing P-SP")
419
+ gen_table_w_p_sp_ds = compute_p_sp(gen_table_w_repetition_ds)
420
+ else:
421
+ gen_table_w_p_sp_ds = gen_table_w_repetition_ds
422
+
423
+ ###########################################################################
424
+ # Coherence evaluation
425
+ ###########################################################################
426
+
427
+ if "coherence" in args.evaluation_metrics:
428
+ print(f"Computing coherence")
429
+ gen_table_w_coherence_ds = compute_coherence(gen_table_w_p_sp_ds)
430
+ else:
431
+ gen_table_w_coherence_ds = gen_table_w_p_sp_ds
432
+
433
+ ###########################################################################
434
+ # Mauve evaluation
435
+ ###########################################################################
436
+
437
+ if "mauve" in args.evaluation_metrics:
438
+ print(f"Computing mauve")
439
+ gen_table_w_mauve_ds = compute_mauve(gen_table_w_coherence_ds)
440
+ else:
441
+ gen_table_w_mauve_ds = gen_table_w_coherence_ds
442
+
443
+ ###########################################################################
444
+ # Retrieval detection
445
+ ###########################################################################
446
+
447
+ if "detect-retrieval" in args.evaluation_metrics:
448
+ print(f"Computing detect retrieval")
449
+ gen_table_w_detect_retrieval_ds = compute_detect_retrieval(gen_table_w_mauve_ds, args=args)
450
+ else:
451
+ gen_table_w_detect_retrieval_ds = gen_table_w_mauve_ds
452
+
453
+ if "prefix_length" in gen_table_w_detect_retrieval_ds.features:
454
+ if "no_wm_output_retrieval_score" in gen_table_w_detect_retrieval_ds.features:
455
+ print("Avg scores at each prefix length for no_wm_output:")
456
+ print(
457
+ gen_table_w_detect_retrieval_ds.to_pandas()
458
+ .groupby("prefix_length")["no_wm_output_retrieval_score"]
459
+ .describe()
460
+ )
461
+ if "w_wm_output_retrieval_score" in gen_table_w_detect_retrieval_ds.features:
462
+ print("Avg scores at each prefix length for w_wm_output:")
463
+ print(
464
+ gen_table_w_detect_retrieval_ds.to_pandas()
465
+ .groupby("prefix_length")["w_wm_output_retrieval_score"]
466
+ .describe()
467
+ )
468
+ if "w_wm_output_attacked_retrieval_score" in gen_table_w_detect_retrieval_ds.features:
469
+ print("Avg scores at each prefix length for no_wm_output_attacked:")
470
+ print(
471
+ gen_table_w_detect_retrieval_ds.to_pandas()
472
+ .groupby("prefix_length")["w_wm_output_attacked_retrieval_score"]
473
+ .describe()
474
+ )
475
+
476
+ ###########################################################################
477
+ # Detectgpt detection
478
+ ###########################################################################
479
+ if "detectgpt" in args.evaluation_metrics:
480
+ assert args.evaluation_metrics == ["detectgpt"], (
481
+ f"Detectgpt must be run separately from other metrics. "
482
+ f"Found: {args.evaluation_metrics}. "
483
+ )
484
+ # check that the right score column exists
485
+ assert any(
486
+ ["detectgpt_score" in col for col in gen_table_w_detect_retrieval_ds.column_names]
487
+ ), (
488
+ f"Detectgpt metric requires the detectgpt_score column to be computed previously "
489
+ f"but no such cols exist in this file."
490
+ )
491
+ print(
492
+ f"Evaluating detectgpt by simply computing ROC-AUC metrics on the scores that already exist"
493
+ )
494
+ gen_table_w_metrics_ds = gen_table_w_detect_retrieval_ds
495
+
496
+ # if we loaded an attack file, since detect gpt only outputs a baseline score col
497
+ # and a no_wm_output score col (which is implicitly the attack col if the file was attacked)
498
+ # we need to add the attacked score col to the dataset, and remove the no_wm score col
499
+ if loaded_attacked:
500
+ for suff in ["100_d", "100_z"]:
501
+ gen_table_w_metrics_ds = gen_table_w_metrics_ds.add_column(
502
+ f"w_wm_output_attacked_detectgpt_score_{suff}",
503
+ gen_table_w_metrics_ds[f"no_wm_output_detectgpt_score_{suff}"],
504
+ )
505
+ gen_table_w_metrics_ds = gen_table_w_metrics_ds.remove_columns(
506
+ [f"no_wm_output_detectgpt_score_{suff}"]
507
+ )
508
+ else:
509
+ ###########################################################################
510
+ # Write the final dataset out to disk in jsonl format
511
+ # with the metrics added
512
+ ###########################################################################
513
+
514
+ # last applied metric, NOTE which will of course change as more are added
515
+ gen_table_w_metrics_ds = gen_table_w_detect_retrieval_ds
516
+
517
+ # write the metadata file, which is a union of the previous metadata
518
+ # and the current cmdline args
519
+ write_json(args.__dict__, metrics_meta_path, indent=4)
520
+
521
+ gen_table_w_metrics_lst = [ex for ex in gen_table_w_metrics_ds]
522
+ write_jsonlines(gen_table_w_metrics_lst, gen_table_w_metrics_path)
523
+
524
+ ###########################################################################
525
+ # Log the metric series to wandb
526
+ ###########################################################################
527
+ # log the metrics to wandb
528
+ if args.wandb:
529
+ # find cols that should be logged in a table
530
+ tabular_column_types = ["string", "bool"]
531
+ tabular_column_names = [
532
+ name
533
+ for name, _ in filter(
534
+ lambda tup: tup[1].dtype in tabular_column_types,
535
+ gen_table_w_metrics_ds.features.items(),
536
+ )
537
+ ]
538
+ # the rest should be logged as series
539
+ series_column_names = [
540
+ name
541
+ for name, _ in filter(
542
+ lambda tup: tup[1].dtype not in tabular_column_types,
543
+ gen_table_w_metrics_ds.features.items(),
544
+ )
545
+ ]
546
+
547
+ for metric_name in series_column_names:
548
+ # summarize series metrics as mean by default
549
+ wandb.define_metric(metric_name, summary="mean")
550
+
551
+ if args.log_raw_series:
552
+ # log the raw series
553
+ for example in tqdm(
554
+ gen_table_w_metrics_ds.remove_columns(tabular_column_names),
555
+ desc="Logging series metrics to wandb",
556
+ ):
557
+ run.log(example)
558
+
559
+ if args.log_raw_tabular:
560
+ # log the raw tabular data
561
+ # but also include the dataset index as a column
562
+ series_column_names.remove("idx")
563
+ table = wandb.Table(
564
+ dataframe=gen_table_w_metrics_ds.remove_columns(series_column_names).to_pandas()
565
+ )
566
+ run.log({"output_table": table})
567
+
568
+ ###########################################################################
569
+ # Filter rows, then log means to wandb
570
+ ###########################################################################
571
+ assert (
572
+ args.target_T - args.lower_tolerance_T
573
+ ) >= 0, "target_T - lower_tolerance_T must be >= 0"
574
+
575
+ target_T = args.target_T
576
+ lower_tolerance = args.lower_tolerance_T
577
+ upper_tolerance = args.upper_tolerance_T
578
+ filtered_table = gen_table_w_metrics_ds.to_pandas()  # explicitly convert lists
579
+
580
+ for col in args.filter_by_columns:
581
+ length_col_name = infer_length_column(col, filtered_table, args=args)
582
+ filtered_table = filter_text_col_length(
583
+ filtered_table,
584
+ text_col_name=length_col_name,
585
+ count_suffix="",
586
+ upper_T=target_T + upper_tolerance,
587
+ lower_T=target_T - lower_tolerance,
588
+ )
589
+
590
+ # Save filtered mean values:
591
+ for metric_name in series_column_names:
592
+ filtered_name = f"f_{target_T}p{upper_tolerance}m{lower_tolerance}_{metric_name}"
593
+ try:
594
+ run.summary[f"{filtered_name}_mean"] = filtered_table[metric_name].mean()
595
+ run.summary[f"{filtered_name}_std"] = filtered_table[metric_name].std()
596
+ except TypeError:
597
+ two_dim_mean = filtered_table[metric_name].apply(np.mean).mean()
598
+
599
+ ###########################################################################
600
+ # Compute ROC-AUC and send to wandb
601
+ ###########################################################################
602
+ try:
603
+ test_stats = args.roc_test_stat
604
+ if isinstance(test_stats, str):
605
+ test_stats = [test_stats]
606
+ for test_stat in test_stats:
607
+ for attacked in [True, False]:
608
+ try:
609
+ roc_auc, fpr, tpr, thresholds, tpr_at_X_fpr = _roc_metrics_for_wandb(
610
+ filtered_table, test_stat, attacked=attacked
611
+ )
612
+ run.summary[
613
+ f"{'attacked_' if attacked else ''}{test_stat}_roc_auc"
614
+ ] = roc_auc
615
+ run.summary[
616
+ f"{'attacked_' if attacked else ''}{test_stat}_tpr_at_X_fpr"
617
+ ] = tpr_at_X_fpr
618
+
619
+ # for tp, fp, thr in tqdm(
620
+ # zip(tpr, fpr, thresholds), desc="Logging ROC curve"
621
+ # ):
622
+ # run.log(
623
+ # {
624
+ # f"{'attacked_' if attacked else ''}{test_stat}_fpr": fp,
625
+ # f"{'attacked_' if attacked else ''}{test_stat}_tpr": tp,
626
+ # f"{'attacked_' if attacked else ''}thr": thr,
627
+ # }
628
+ # )
629
+ data = [[x, y] for (x, y) in zip(fpr, tpr)]
630
+ table = wandb.Table(data=data, columns=["fpr", "tpr"])
631
+ run.log(
632
+ {
633
+ f"{'attacked_' if attacked else ''}{test_stat}": wandb.plot.line(
634
+ table,
635
+ "fpr",
636
+ "tpr",
637
+ title=f"ROC ({test_stat}{',attacked' if attacked else ',clean'})",
638
+ )
639
+ }
640
+ )
641
+ print(f"Successfully logged ROC-AUC metrics for {test_stat}.")
642
+
643
+ except Exception as e:
644
+ if args.verbose:
645
+ print(e)
646
+ print(
647
+ f"Failed to log ROC-AUC metrics for {'attacked output' if attacked else ''} {test_stat}."
648
+ f"Metric probably was not computed and or attack col not present."
649
+ )
650
+ except Exception as e:
651
+ if args.verbose:
652
+ print(f"Exception: {e}")
653
+ print(
654
+ f"Failed to log ROC-AUC metrics. ",
655
+ f"Make sure the test statistic required for detection ({test_stat}) has been computed!",
656
+ )
657
+
658
+ ################################################################################
659
+ # NOTE we do that ^^^ basic ROC logic first because it's faster
660
+ # as well as the manual prefix lengths at T logic bc that's also faster
661
+ ################################################################################
662
+
663
+ # Handle z @ T but for the retrieval and detectgpt scores that are evaluated
664
+ # manually at each prefix length. Use groupby to compute the mean and std
665
+ # for each prefix length for any of the feats that have retrieval_score in them,
666
+ # then log those pairs to wandb.
667
+ at_T_df = gen_table_w_metrics_ds.to_pandas()
668
+
669
+ for name, feat in gen_table_w_metrics_ds.features.items():
670
+ if "retrieval_score" in name and "prefix_length" in at_T_df.columns:
671
+ # compute the mean and std for each prefix length
672
+ # and log those pairs to wandb
673
+ df_view = at_T_df.groupby("prefix_length")[name].describe()[["mean", "std"]]
674
+ T_indices = df_view.index
675
+
676
+ # for idx, (mean, std) in df_view.iterrows():
677
+ # run.log(data={f"{name}_mean": mean, f"{name}_std": std, "idx_T": idx})
678
+ # log this triple as a table instead like the ROC curve above
679
+ # where the first two are plotted and the third is the x axis
680
+ data = [[x, y, z] for x, (y, z) in df_view.iterrows()]
681
+ table = wandb.Table(data=data, columns=["idx_T", "mean", "std"])
682
+ # compute stderr from std
683
+ table.add_column(
684
+ "stderr",
685
+ [
686
+ std / np.sqrt(len(at_T_df[at_T_df["prefix_length"] == idx]))
687
+ for idx, std in zip(T_indices, df_view["std"])
688
+ ],
689
+ )
690
+ # first log mean
691
+ run.log({f"{name}": wandb.plot.line(table, "idx_T", "mean", title=f"{name} mean")})
692
+ # then log std err
693
+ run.log(
694
+ {
695
+ f"{name}_stderr": wandb.plot.line(
696
+ table, "idx_T", "stderr", title=f"{name} stderr"
697
+ )
698
+ }
699
+ )
700
+
701
+ # also compute an AUC at each prefix len idx by treating the name col as the positives
702
+ # and the baseline_completion_retrieval_score as the negatives
703
+ # then log those pairs to wandb
704
+ if name != "baseline_completion_retrieval_score":
705
+ pos_negs_at_T = at_T_df.groupby("prefix_length")[
706
+ [name, "baseline_completion_retrieval_score"]
707
+ ]
708
+ # auc_at_T = []
709
+ # tpr_at_X_fpr = []
710
+ all_aucs, all_tpr_at_X_fpr = [], []
711
+ for idx, sub_df in pos_negs_at_T:
712
+ pos = sub_df[name]
713
+ neg = sub_df["baseline_completion_retrieval_score"]
714
+ # convert to arrays and remove nans
715
+ pos = pos.to_numpy()[~np.isnan(pos.to_numpy())]
716
+ neg = neg.to_numpy()[~np.isnan(neg.to_numpy())]
717
+
718
+ fpr, tpr, thresholds = metrics.roc_curve(
719
+ np.concatenate([np.ones_like(pos), np.zeros_like(neg)]), # labels
720
+ np.concatenate([pos, neg]), # scores
721
+ pos_label=1,
722
+ )
723
+ auc = metrics.auc(fpr, tpr)
724
+ try:
725
+ tpr_at_X_fpr = tpr[np.where(fpr < 1e-3)[0][-1]]
726
+ except IndexError:
727
+ tpr_at_X_fpr = float("NaN")
728
+ all_aucs.append(auc)
729
+ all_tpr_at_X_fpr.append(tpr_at_X_fpr)
730
+
731
+ # run.log(data={f"{name}_auc_at_T": auc, "idx_T": idx})
732
+ # log this triple as a table instead like the AUC and tpr at X fpr below
733
+ # where the first two are plotted and the third is the x axis
734
+ data = [
735
+ [x, y, z] for x, (y, z) in zip(T_indices, zip(all_aucs, all_tpr_at_X_fpr))
736
+ ]
737
+ table = wandb.Table(data=data, columns=["idx_T", "aucs", "tpr_at"])
738
+ run.log(
739
+ {
740
+ f"{name}_aucs": wandb.plot.line(
741
+ table, "idx_T", "aucs", title=f"{name} aucs"
742
+ )
743
+ }
744
+ )
745
+ run.log(
746
+ {
747
+ f"{name}_tpr_at": wandb.plot.line(
748
+ table, "idx_T", "tpr_at", title=f"{name} tpr_at"
749
+ )
750
+ }
751
+ )
752
+
753
+ elif "detectgpt_score" in name and "prefix_length" in at_T_df.columns:
754
+ # this covers detectgpt_score_100_d and variants
755
+ # compute the mean and std for each prefix length
756
+ # and log those pairs to wandb
757
+ df_view = at_T_df.groupby("prefix_length")[name].describe()[["mean", "std"]]
758
+ T_indices = df_view.index
759
+
760
+ # for idx, (mean, std) in df_view.iterrows():
761
+ # run.log(data={f"{name}_mean": mean, f"{name}_std": std, "idx_T": idx})
762
+ # log this triple as a table instead like the ROC curve above
763
+ # where the first two are plotted and the third is the x axis
764
+ data = [[x, y, z] for x, (y, z) in df_view.iterrows()]
765
+ table = wandb.Table(data=data, columns=["idx_T", "mean", "std"])
766
+
767
+ # compute stderr from std
768
+ table.add_column(
769
+ "stderr",
770
+ [
771
+ std / np.sqrt(len(at_T_df[at_T_df["prefix_length"] == idx]))
772
+ for idx, std in zip(T_indices, df_view["std"])
773
+ ],
774
+ )
775
+ # first log mean
776
+ run.log({f"{name}": wandb.plot.line(table, "idx_T", "mean", title=f"{name} mean")})
777
+ # then log std err
778
+ run.log(
779
+ {
780
+ f"{name}_stderr": wandb.plot.line(
781
+ table, "idx_T", "stderr", title=f"{name} stderr"
782
+ )
783
+ }
784
+ )
785
+
786
+ # also compute an AUC at each prefix len idx by treating the name col as the positives
787
+ # and the baseline_completion_retrieval_score as the negatives
788
+ # then log those pairs to wandb
789
+ if "baseline_completion_detectgpt_score" not in name:
790
+ # check which suffix this is in ["_100_d", "_100_z"]
791
+ # and use that to set the baseline (false/negative) col
792
+ if name.endswith("_100_d"):
793
+ baseline_col = "baseline_completion_detectgpt_score_100_d"
794
+ elif name.endswith("_100_z"):
795
+ baseline_col = "baseline_completion_detectgpt_score_100_z"
796
+ pos_negs_at_T = at_T_df.groupby("prefix_length")[[name, baseline_col]]
797
+ # auc_at_T = []
798
+ # tpr_at_X_fpr = []
799
+ all_aucs, all_tpr_at_X_fpr = [], []
800
+ for idx, sub_df in pos_negs_at_T:
801
+ pos = sub_df[name]
802
+ neg = sub_df[baseline_col]
803
+ # convert to arrays and remove nans
804
+ pos = pos.to_numpy()[~np.isnan(pos.to_numpy())]
805
+ neg = neg.to_numpy()[~np.isnan(neg.to_numpy())]
806
+
807
+ fpr, tpr, thresholds = metrics.roc_curve(
808
+ np.concatenate([np.ones_like(pos), np.zeros_like(neg)]), # labels
809
+ np.concatenate([pos, neg]), # scores
810
+ pos_label=1,
811
+ )
812
+ auc = metrics.auc(fpr, tpr)
813
+ try:
814
+ tpr_at_X_fpr = tpr[np.where(fpr < 1e-3)[0][-1]]
815
+ except IndexError:
816
+ tpr_at_X_fpr = float("NaN")
817
+ all_aucs.append(auc)
818
+ all_tpr_at_X_fpr.append(tpr_at_X_fpr)
819
+
820
+ # run.log(data={f"{name}_auc_at_T": auc, "idx_T": idx})
821
+ # log this triple as a table instead like the AUC and tpr at X fpr below
822
+ # where the first two are plotted and the third is the x axis
823
+ data = [
824
+ [x, y, z] for x, (y, z) in zip(T_indices, zip(all_aucs, all_tpr_at_X_fpr))
825
+ ]
826
+ table = wandb.Table(data=data, columns=["idx_T", "aucs", "tpr_at"])
827
+ run.log(
828
+ {
829
+ f"{name}_aucs": wandb.plot.line(
830
+ table, "idx_T", "aucs", title=f"{name} aucs"
831
+ )
832
+ }
833
+ )
834
+ run.log(
835
+ {
836
+ f"{name}_tpr_at": wandb.plot.line(
837
+ table, "idx_T", "tpr_at", title=f"{name} tpr_at"
838
+ )
839
+ }
840
+ )
841
+
842
+ ###########################################################################
843
+ # Compute our @ T detection metrics and send to wandb
844
+ ###########################################################################
845
+
846
+ # Merge z_at_T and other sequence metrics so they can be shown in wandb:
847
+ for name, feat in gen_table_w_metrics_ds.features.items():
848
+ if isinstance(feat, Sequence):
849
+ max_feat_seq_len = max([len(l) for l in gen_table_w_metrics_ds[name]])
850
+ merging_seq = np.zeros(max_feat_seq_len)
851
+ counts = np.zeros(max_feat_seq_len)
852
+ proto_variance = np.zeros(max_feat_seq_len)
853
+ for entry in gen_table_w_metrics_ds[name]:
854
+ len_seq = len(entry)
855
+ delta = entry * counts[:len_seq] - merging_seq[:len_seq]
856
+ # Accumulate ragged sum over entries:
857
+ counts[:len_seq] += 1
858
+ merging_seq[:len_seq] += entry[: len(merging_seq)]
859
+ # Compute ragged, running variance via Welford:
860
+ gamma = entry * counts[:len_seq] - merging_seq[:len_seq]
861
+ proto_variance[:len_seq] += (delta / counts[:len_seq]) * (
862
+ gamma / counts[:len_seq]
863
+ )
864
+
865
+ mask = counts != 0
866
+ averaged_seq = merging_seq.copy()
867
+ averaged_seq[mask] /= counts
868
+ averaged_seq[~mask] = float("NaN")
869
+
870
+ seq_stderr = proto_variance.copy()
871
+ seq_stderr[counts > 1] = np.sqrt(
872
+ proto_variance[counts > 1] / (counts[counts > 1] - 1)
873
+ ) / np.sqrt(counts[counts > 1])
874
+ seq_stderr[counts <= 1] = float("NaN")
875
+ # for idx, (avg, stderr) in enumerate(zip(averaged_seq[mask], seq_stderr[mask])):
876
+ # run.log(data={f"{name}_avg": avg, f"{name}_stderr": stderr, "idx_T": idx})
877
+ # log this triple as a table instead like the ROC curve above
878
+ # where the first two are plotted and the third is the x axis
879
+ data = [
880
+ [x, y, z]
881
+ for (x, y, z) in zip(
882
+ averaged_seq[mask], seq_stderr[mask], range(len(averaged_seq[mask]))
883
+ )
884
+ ]
885
+ table = wandb.Table(data=data, columns=["avg", "stderr", "idx_T"])
886
+
887
+ # first plot avg
888
+ run.log({f"{name}": wandb.plot.line(table, "idx_T", "avg", title=f"{name} avg")})
889
+ # then plot stderr
890
+ run.log(
891
+ {
892
+ f"{name}_stderr": wandb.plot.line(
893
+ table, "idx_T", "stderr", title=f"{name} stderr"
894
+ )
895
+ }
896
+ )
897
+
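# For reference, the ragged accumulation above vectorizes Welford's running mean/variance
# over positions; the scalar update it generalizes is (illustrative only):
def _welford_update(count, mean, m2, new_value):
    count += 1
    delta = new_value - mean
    mean += delta / count
    m2 += delta * (new_value - mean)  # sample variance = m2 / (count - 1) for count > 1
    return count, mean, m2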
898
+ # Compute AUC_at_T
899
+ # For now we'll just do a dumb loop over scipy.roc_curve, but this could be batched
900
+ test_stats = args.roc_test_stat
901
+ if isinstance(test_stats, str):
902
+ test_stats = [test_stats]
903
+
904
+ for test_stat in test_stats:
905
+ for attacked in [True, False]:
906
+ base_col = f"baseline_completion_{test_stat}_at_T"
907
+ w_wm_col = f"w_wm_output{'_attacked' if attacked else ''}_{test_stat}_at_T"
908
+ name = f"w_wm{'_attacked' if attacked else ''}_{test_stat}_at_T"
909
+
910
+ if w_wm_col in gen_table_w_metrics_ds.features.keys(): # metric was computed
911
+ print(f"Computing AUC at T for {name}.")
912
+ max_length = min(
913
+ max([len(l) for l in gen_table_w_metrics_ds[base_col]]),
914
+ max([len(l) for l in gen_table_w_metrics_ds[w_wm_col]]),
915
+ )
916
+
917
+ all_aucs, all_tpr_at_X_fpr = [], []
918
+ for T in range(1, max_length):
919
+ w_wm_stats = np.array(
920
+ [t[T] for t in gen_table_w_metrics_ds[w_wm_col] if len(t) > T]
921
+ )
922
+
923
+ baseline_stats = np.array(
924
+ [t[T] for t in gen_table_w_metrics_ds[base_col] if len(t) > T]
925
+ )[: len(w_wm_stats)]
926
+ all_scores = np.concatenate([baseline_stats, w_wm_stats])
927
+
928
+ baseline_labels = np.zeros_like(baseline_stats)
929
+ attacked_labels = np.ones_like(w_wm_stats)
930
+ all_labels = np.concatenate([baseline_labels, attacked_labels])
931
+
932
+ if len(np.unique(all_labels)) < 2:
933
+ roc_auc = float("NaN")
934
+ tpr_at_X_fpr = float("NaN")
935
+ else:
936
+ fpr, tpr, thresholds = metrics.roc_curve(
937
+ all_labels, all_scores, pos_label=1
938
+ )
939
+ roc_auc = metrics.auc(fpr, tpr)
940
+ try:
941
+ tpr_at_X_fpr = tpr[np.where(fpr < 1e-3)[0][-1]]
942
+ except IndexError:
943
+ tpr_at_X_fpr = float("NaN")
944
+
945
+ all_aucs.append(roc_auc)
946
+ all_tpr_at_X_fpr.append(tpr_at_X_fpr)
947
+ # for idx, (aucs, tpr_at) in enumerate(zip(all_aucs, all_tpr_at_X_fpr)):
948
+ # run.log(data={f"{name}_aucs": aucs, f"{name}_tpr_at": tpr_at, "idx_T": idx})
949
+ # log these two separately using a table
950
+ data = [
951
+ [x, y, z]
952
+ for (x, y, z) in zip(all_aucs, all_tpr_at_X_fpr, range(len(all_aucs)))
953
+ ]
954
+ table = wandb.Table(data=data, columns=["aucs", "tpr_at", "idx_T"])
955
+ run.log(
956
+ {
957
+ f"{name}_aucs": wandb.plot.line(
958
+ table, "idx_T", "aucs", title=f"{name} aucs"
959
+ )
960
+ }
961
+ )
962
+ run.log(
963
+ {
964
+ f"{name}_tpr_at": wandb.plot.line(
965
+ table, "idx_T", "tpr_at", title=f"{name} tpr_at"
966
+ )
967
+ }
968
+ )
969
+
970
+ # finish the wandb run
971
+ run.finish()
972
+
973
+ return
974
+
975
+
976
+ def _roc_metrics_for_wandb(
977
+ gen_table_ds, test_stat="z_score", prefix="", attacked=False, remove_nan=True
978
+ ):
979
+ # In theory, we actually should be filtering the attacked column too, but we know these
980
+ # end up very short sometimes. So, to make sure the logic works, we just
981
+ # filter for any rows where the test metrics are NaN and note the damage
982
+
983
+ baseline_col_name = f"{prefix}baseline_completion_{test_stat}"
984
+ if "retrieval" in test_stat:
985
+ if attacked:
986
+ w_wm_col_name = f"{prefix}w_wm_output_attacked_retrieval_score"
987
+ else:
988
+ w_wm_col_name = f"{prefix}{args.retrieval_db_column}_retrieval_score"
989
+ elif "detectgpt" in test_stat:
990
+ if attacked:
991
+ w_wm_col_name = f"{prefix}w_wm_output_attacked_{test_stat}"
992
+ else:
993
+ w_wm_col_name = f"{prefix}no_wm_output_{test_stat}"
994
+ else:
995
+ w_wm_col_name = f"{prefix}w_wm_output{'_attacked' if attacked else ''}_{test_stat}"
996
+
997
+ # drop nans in either column
998
+ if remove_nan:
999
+ orig_length = len(gen_table_ds)
1000
+ gen_table_ds = gen_table_ds.dropna(subset=[baseline_col_name, w_wm_col_name])
1001
+ if orig_length != len(gen_table_ds):
1002
+ print(
1003
+ f"NOTE: During ROC calculation, dropped {orig_length - len(gen_table_ds)} rows due to NaNs in {baseline_col_name} or {w_wm_col_name}"
1004
+ )
1005
+
1006
+ baseline_stats = gen_table_ds[baseline_col_name].values
1007
+ w_wm_stats = gen_table_ds[w_wm_col_name].values
1008
+ all_scores = np.concatenate([baseline_stats, w_wm_stats])
1009
+
1010
+ baseline_labels = np.zeros_like(baseline_stats)
1011
+ attacked_labels = np.ones_like(w_wm_stats)
1012
+ all_labels = np.concatenate([baseline_labels, attacked_labels])
1013
+
1014
+ fpr, tpr, thresholds = metrics.roc_curve(all_labels, all_scores, pos_label=1)
1015
+ roc_auc = metrics.auc(fpr, tpr)
1016
+ try:
1017
+ tpr_at_X_fpr = tpr[np.where(fpr < 1e-3)[0][-1]]
1018
+ except IndexError:
1019
+ tpr_at_X_fpr = float("NaN")
1020
+ return roc_auc, fpr, tpr, thresholds, tpr_at_X_fpr
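# Hypothetical usage of the helper above (the referenced columns must already exist
# in the merged table produced by the pipeline):
#
#     df = gen_table_w_metrics_ds.to_pandas()
#     roc_auc, fpr, tpr, thresholds, tpr_at_low_fpr = _roc_metrics_for_wandb(
#         df, test_stat="z_score", attacked=False
#     )
#
# roc_auc is the area under the ROC built from baseline vs. watermarked z-scores, and
# tpr_at_low_fpr is the true-positive rate at the largest threshold with FPR below 1e-3.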
1021
+
1022
+
1023
+ if __name__ == "__main__":
1024
+ parser = argparse.ArgumentParser(description="Run evaluation pipeline for watermark detection")
1025
+ parser.add_argument(
1026
+ "--evaluation_metrics",
1027
+ type=str,
1028
+ default="all",
1029
+ help="Comma separated list of columns to remove from the dataset before generation.",
1030
+ )
1031
+ parser.add_argument(
1032
+ "--compute_scores_at_T",
1033
+ type=str2bool,
1034
+ default=True,
1035
+ help="Whether to compute (applicable) metrics at each T index in the output/text columns.",
1036
+ )
1037
+ parser.add_argument(
1038
+ "--overwrite_args",
1039
+ type=str2bool,
1040
+ default=False,
1041
+ help="Whether to overwrite the shared args in the metadata file with the current, runtime args.",
1042
+ )
1043
+ parser.add_argument(
1044
+ "--oracle_model_name_or_path",
1045
+ type=str,
1046
+ default="facebook/opt-6.7b",
1047
+ help="Oracle model, path to pretrained model or model identifier from huggingface.co/models.",
1048
+ )
1049
+ parser.add_argument(
1050
+ "--load_fp16",
1051
+ type=str2bool,
1052
+ default=None,
1053
+ help=(
1054
+ "Whether to run model (for ppl) in float16 precsion, note, will overwrite error as a reminder that "
1055
+ "generation was run in other mode, even though there's no hard requirement that these match."
1056
+ ),
1057
+ )
1058
+ parser.add_argument(
1059
+ "--ppl_batch_size",
1060
+ type=int,
1061
+ default=1,
1062
+ help="Batch size for ppl eval.",
1063
+ )
1064
+ parser.add_argument(
1065
+ "--seeding_scheme",
1066
+ type=Union[str, NoneType],
1067
+ default=None,
1068
+ help="Seeding scheme to use to generate the greenlists at each generation and verification step.",
1069
+ )
1070
+ parser.add_argument(
1071
+ "--gamma",
1072
+ type=Union[float, NoneType],
1073
+ default=None,
1074
+ help="The fraction of the vocabulary to partition into the greenlist at each generation and verification step.",
1075
+ )
1076
+ parser.add_argument(
1077
+ "--normalizers",
1078
+ type=Union[str, NoneType],
1079
+ default=None,
1080
+ help="Single or comma separated list of the preprocessors/normalizer names to use when performing watermark detection.",
1081
+ )
1082
+ parser.add_argument(
1083
+ "--ignore_repeated_ngrams",
1084
+ type=str2bool,
1085
+ default=False,
1086
+ help="Whether to use the detection method that only counts each unqiue bigram once as either a green or red hit.",
1087
+ )
1088
+ parser.add_argument(
+ "--detection_z_threshold",
+ type=float,
+ default=4.0,
+ help="The test statistic threshold for the detection hypothesis test.",
+ )
+ parser.add_argument(
+ "--return_green_token_mask",
+ type=str2bool,
+ default=True,
+ help="Whether to return the mask marking which tokens are green from the watermark detector.",
+ )
+ parser.add_argument(
+ "--window_settings",
+ type=str,
+ default="20,40,max", # can also be "20" or "20,40,max"
+ help="Comma separated list of window sizes to use for watermark detection. Only used if 'windowed-z-score' is in the evaluation metrics list.",
+ )
+ parser.add_argument(
+ "--run_len_chisqrd_variant",
+ type=str,
+ default="F_succ_T_runs",
+ choices=["F_succ_T_runs", "T_and_F_runs"],
+ help="The variant of the run length test to use for watermark detection.",
+ )
+ parser.add_argument(
+ "--run_len_chisqrd_bin_spec",
+ type=str,
+ default="max_plus_1",
+ choices=["max", "max_plus_1"],
+ help="The binning specification to use for the run length test.",
+ )
+ parser.add_argument(
+ "--run_len_chisqrd_mask_zeros",
+ type=str2bool,
+ default=True,
+ help="Whether to mask zeros in the run length test.",
+ )
+ parser.add_argument(
+ "--run_len_chisqrd_mask_leading_bins",
+ type=int,
+ default=0,
+ help="The number of leading bins to mask in the run length test.",
+ )
+ parser.add_argument(
+ "--run_len_chisqrd_lambda",
+ type=str,
+ default="pearson",
+ choices=["pearson", "g_test", "cressie_read"],
+ help="The lambda_ param to use for the run length test.",
+ )
+ parser.add_argument(
+ "--retrieval_technique",
+ type=str,
+ default="bm25",
+ choices=["bm25", "sim"],
+ help="The retrieval technique to use for retrieval detection.",
+ )
+ parser.add_argument(
+ "--retrieval_db_column",
+ type=str,
+ default="no_wm_output",
+ choices=["w_wm_output", "no_wm_output"],
+ help="The column to populate the retrieval db/index with for retrieval detection.",
+ )
+ parser.add_argument(
+ "--retrieval_db_load_all_prefixes",
+ type=str2bool,
+ default=False,
+ help="Whether to load all prefixes into the retrieval db, or just the longest for each unique entry.",
+ )
+ parser.add_argument(
+ "--roc_test_stat",
+ type=str,
+ default="all",
+ help="The comma separated list of test statistics to use for the ROC-AUC metric.",
+ )
+ parser.add_argument(
+ "--target_T",
+ type=int,
+ default=0,
+ help="The target generation length to use when dropping rows before ROC-AUC evaluation.",
+ )
+ parser.add_argument(
+ "--lower_tolerance_T",
+ type=int,
+ default=25,
+ help="The lower tolerance to use when dropping rows before ROC-AUC evaluation.",
+ )
+ parser.add_argument(
+ "--upper_tolerance_T",
+ type=int,
+ default=25,
+ help="The upper tolerance to use when dropping rows before ROC-AUC evaluation.",
+ )
+ parser.add_argument(
+ "--filter_by_columns",
+ type=str,
+ default="all",
+ help="The comma separated list of columns to filter by before ROC-AUC evaluation.",
+ )
+ parser.add_argument(
+ "--wandb",
+ type=str2bool,
+ default=False,
+ help="Whether to log to wandb.",
+ )
+ parser.add_argument(
+ "--wandb_project",
+ type=str,
+ default="lm-watermarking",
+ help="The name of the wandb project.",
+ )
+ parser.add_argument(
+ "--wandb_entity",
+ type=str,
+ default="jwkirchenbauer",
+ help="The wandb entity/user for the project.",
+ )
+ parser.add_argument(
+ "--wandb_tags",
+ type=str,
+ default="",
+ help="The comma separated list of tags to add to the wandb run.",
+ )
+ parser.add_argument(
+ "--run_name",
+ type=str,
+ default="",
+ help="The unique name for the run.",
+ )
+ parser.add_argument(
+ "--input_dir",
+ type=str,
+ default="./input",
+ help="The directory containing the input files.",
+ )
+ parser.add_argument(
+ "--output_dir",
+ type=str,
+ default="",
+ help=(
+ "The directory in which to write out the dataset after adding the metrics. "
+ "If not specified, will use the input_dir. Note, if the output_dir already "
+ "contains the metric-enriched file, it will be overwritten :/"
+ ),
+ )
+ parser.add_argument(
+ "--overwrite_output_file",
+ type=str2bool,
+ default=False,
+ help="Whether to overwrite the output file if it already exists.",
+ )
+ parser.add_argument(
+ "--limit_rows",
+ type=int,
+ default=-1,
+ help="The number of rows to limit the dataset to. Useful for debugging.",
+ )
+ parser.add_argument(
+ "--concat_rows",
+ type=int,
+ default=0,
+ help="The number of rows to concatenate into a single row. Result is a mangled dataset, be careful",
+ )
+ parser.add_argument(
+ "--shuffle_before_concat",
+ type=str2bool,
+ default=False,
+ help="Whether to shuffle the dataset before concatenating rows.",
+ )
+ parser.add_argument(
+ "--verbose",
+ type=str2bool,
+ default=None,
+ help="Whether to verbosely print things here and there.",
+ )
+ parser.add_argument(
+ "--log_raw_series",
+ type=str2bool,
+ default=True,
+ help="Whether to log the raw series metric data to wandb.",
+ )
+ parser.add_argument(
+ "--log_raw_tabular",
+ type=str2bool,
+ default=True,
+ help="Whether to log the raw tabular metric data to wandb.",
+ )
+ args = parser.parse_args()
+
+ ###########################################################################
+ # Argument validation and conditional setting
+ ###########################################################################
+
+ # convert evaluation metrics to list
+ assert args.evaluation_metrics, "evaluation_metrics list must be specified"
+ args.evaluation_metrics = args.evaluation_metrics.split(",")
+
+ if args.evaluation_metrics == ["all"]:
+ all_metrics = list(SUPPORTED_METRICS)
+ all_metrics.remove("ppl") # by default not running this anymore
+ all_metrics.remove("detectgpt") # can't run this with other metrics
+ args.evaluation_metrics = all_metrics
+ if args.evaluation_metrics == ["all_w_ppl"]:
+ args.evaluation_metrics = SUPPORTED_METRICS
+
+ # if no output dir specified, use the input dir
+ if args.output_dir == "":
+ args.output_dir = args.input_dir
+
+ # check limit_rows
+ assert (args.limit_rows == -1) or (
+ (args.limit_rows > 0) and isinstance(args.limit_rows, int)
+ ), "limit_rows must be -1 or > 0"
+
+ # convert normalizers to list
+ if args.normalizers:
+ args.normalizers = args.normalizers.split(",")
+ else:
+ args.normalizers = []
+
+ # convert roc_test_stat to list
+ args.roc_test_stat = args.roc_test_stat.split(",")
+
+ if args.roc_test_stat == ["all"]:
+ args.roc_test_stat = ROC_TEST_STAT_SUFFIXES
+
+ # convert filter_by_columns to list
+ args.filter_by_columns = args.filter_by_columns.split(",")
+ if args.filter_by_columns == ["all"]:
+ args.filter_by_columns = FILTER_BY_COLUMNS
+
+ # split wandb tags
+ if args.wandb_tags != "":
+ args.wandb_tags = args.wandb_tags.split(",")
+ else:
+ args.wandb_tags = []
+
+ # split window settings
+ args.window_settings = args.window_settings.split(",")
+
+ main(args)
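Most of the boolean flags above are parsed with a `str2bool` converter that is imported from the repo's utilities and is not part of this hunk. As an illustration of the pattern only (not necessarily the repo's exact implementation), such a helper typically looks like:

```python
# Illustrative sketch: a typical argparse-compatible str2bool helper; the pipeline
# imports its own version from its utils module, which may differ in detail.
import argparse

def str2bool(v):
    """Parse common true/false spellings into a bool for use as an argparse `type=`."""
    if isinstance(v, bool):
        return v
    if v.lower() in ("yes", "true", "t", "y", "1"):
        return True
    if v.lower() in ("no", "false", "f", "n", "0"):
        return False
    raise argparse.ArgumentTypeError(f"Boolean value expected, got {v!r}")
```

With a converter like this, `--wandb True`, `--wandb true`, and `--wandb 1` all parse identically, which is why the flags above take explicit values rather than acting as store-true switches.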
lm-watermarking-main/watermark_reliability_release/figure_notebooks/baseline_comparison.ipynb ADDED
The diff for this file is too large to render. See raw diff
 
lm-watermarking-main/watermark_reliability_release/figure_notebooks/baseline_comparison_transpose.ipynb ADDED
The diff for this file is too large to render. See raw diff
 
lm-watermarking-main/watermark_reliability_release/figure_notebooks/core_robustness.ipynb ADDED
The diff for this file is too large to render. See raw diff
 
lm-watermarking-main/watermark_reliability_release/figure_notebooks/data_model.ipynb ADDED
The diff for this file is too large to render. See raw diff