@article {2482, title = {Using Response Time to Detect Item Preknowledge in Computer-Based Licensure Examinations}, journal = {Educational Measurement: Issues and Practice}, volume = {35}, pages = {38-47}, year = {2016}, abstract = {This article addresses the issue of how to detect item preknowledge using item response time data in two computer-based large-scale licensure examinations. Item preknowledge is indicated by an unexpectedly short response time together with a correct response. Two samples were used for detecting item preknowledge for each examination. The first sample was from the early stage of the operational test and was used for item calibration. The second sample was from the late stage of the operational test, which may feature item preknowledge. The purpose of this research was to explore whether there was evidence of item preknowledge and compromised items in the second sample, using the parameters estimated from the first sample. The results showed that for one nonadaptive operational examination, two items (of 111) were potentially exposed, and two candidates (of 1,172) showed some indication of preknowledge on multiple items. For another licensure examination that featured computerized adaptive testing, there was no indication of item preknowledge or compromised items. Implications for the detected aberrant examinees and compromised items are discussed.}, doi = {10.1111/emip.12102}, author = {Qian, H. and Staniewska, D. and Reckase, M. and Woo, A.} } @article {2047, title = {Comparison Between Dichotomous and Polytomous Scoring of Innovative Items in a Large-Scale Computerized Adaptive Test}, journal = {Educational and Psychological Measurement}, volume = {72}, year = {2012}, pages = {493-509}, abstract = {

This study explored the impact of partial credit scoring of one type of innovative item (multiple-response items) in the pretest and operational settings of a computerized adaptive version of a large-scale licensure examination. The impact of partial credit scoring on ability estimation and classification decisions in operational test settings was explored in one real-data analysis and two simulation studies in which two different polytomous scoring algorithms, automated polytomous scoring and rater-generated polytomous scoring, were applied. For the real-data analyses, the ability estimates from dichotomous and polytomous scoring were highly correlated, and the classification consistency between the scoring algorithms was nearly perfect. The information distribution in the operational item bank changed only slightly. In the two simulation studies, which compared each polytomous scoring algorithm with dichotomous scoring, the ability estimates resulting from polytomous scoring had slightly higher measurement precision than those resulting from dichotomous scoring. The practical impact on classification decisions was minor because of the extremely small number of items that could be scored polytomously in the current study.

}, doi = {10.1177/0013164411422903}, author = {Jiao, H. and Liu, J. and Haynie, K. and Woo, A. and Gorham, J.} } @inbook {1954, title = {Developing item variants: An empirical study}, year = {2009}, note = {{PDF file, 194 KB}}, address = {D. J. Weiss (Ed.), Proceedings of the 2009 GMAC Conference on Computerized Adaptive Testing.}, abstract = {Large-scale standardized tests have been widely used for educational and licensure testing. In computerized adaptive testing (CAT), one of the practical concerns in maintaining large-scale assessments is ensuring adequate numbers of high-quality items for item pool functioning. Developing items at specific difficulty levels and for certain areas of test plans is a well-known challenge. The purpose of this study was to investigate item-variant strategies that can effectively generate items at targeted difficulty levels and for specific test plan areas. Each variant item generation model was developed by decomposing selected source items that possessed ideal measurement properties and targeted the desired content domains. A total of 341 variant items were generated from 72 source items. Data were collected from six pretest periods, and items were calibrated using the Rasch model. Initial results indicate that the variant items showed desirable measurement properties. Additionally, compared to an average of approximately 60\% of items passing the pretest criteria overall, an average of 84\% of the variant items passed the pretest criteria.}, author = {Wendt, A. and Kao, S. and Gorham, J. and Woo, A.} } @inbook {1822, title = {Limiting item exposure for target difficulty ranges in a high-stakes CAT}, year = {2009}, note = {{PDF file, 1 MB}}, address = {D. J. Weiss (Ed.), Proceedings of the 2009 GMAC Conference on Computerized Adaptive Testing.}, author = {Li, X. and Becker, K. and Gorham, J. and Woo, A.} }